aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_conv/depthwise/kernels
diff options
context:
space:
mode:
author: Michael Tyler <michael.tyler@arm.com> 2022-12-15 12:39:29 +0000
committer: michael.tyler <michael.tyler@arm.com> 2023-01-16 09:31:00 +0000
commit: ba209750abc1ac7e42bab9fef5db284384d70fb3 (patch)
tree: 1065f242db9a9a5e48bd4a9f2fd68aef1924827a /src/core/NEON/kernels/arm_conv/depthwise/kernels
parent: 8094f9dd5307c55f545b2cb41ec80a739a9b4d6f (diff)
download: ComputeLibrary-ba209750abc1ac7e42bab9fef5db284384d70fb3.tar.gz
Update CPU kernels to remove x19
Resolves: COMPMID-5805
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Change-Id: I250f64531e209625e4ff176dd5a552c1c34bc484
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8909
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_conv/depthwise/kernels')
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp622
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp502
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp932
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp1212
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1356
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1882
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp816
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp780
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp1176
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp1048
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp739
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1631
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp672
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp670
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp1126
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp1214
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1837
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp2044
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp788
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp820
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp832
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp726
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp517
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp283
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp247
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1391
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp2738
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1764
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1866
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp946
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp866
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp1084
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp2624
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp2512
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp2738
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1764
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1866
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp946
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp866
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp1084
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp2624
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1704
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1850
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2860
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1764
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1866
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp946
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp2624
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp294
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp224
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp328
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp362
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp478
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp200
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp268
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp700
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp986
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp1450
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp486
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp754
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp998
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp1296
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp696
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_2rows_dot_za/generic.cpp592
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp836
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp1020
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp1306
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp696
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_2rows_dot_za/generic.cpp592
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp836
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1020
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1306
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp696
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_2rows_dot_za/generic.cpp592
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp836
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1020
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1306
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp404
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp400
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp712
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1050
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1172
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp448
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp788
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp874
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp255
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp404
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp400
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp66
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp247
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp712
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1050
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1172
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp448
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp788
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp874
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp194
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp352
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp610
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp752
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp848
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp554
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp620
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp1018
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp548
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp646
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp704
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp848
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp554
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp620
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp1018
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp548
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp646
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp596
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp620
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp1014
125 files changed, 62495 insertions, 64028 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index a85e44360e..f4027df375 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,198 +87,198 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x23, #0x0\n"
"mov x22, #0x0\n"
- "mov x21, #0x0\n"
"1:" // Tile loop
- "str x22, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
"mov x26, #0x2\n"
- "mov x25, #0x2\n"
- "str x21, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x22, x24\n" // offset = tile_i * ld_input_row
- "ldr x14, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x22, x23\n" // offset = tile_i * ld_output_row
- "mov x22, #0x10\n" // cntb _, ALL, #1
- "madd x20, x21, x14, x20\n" // offset += tile_j * ld_input_col
- "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x14, x14, #0x1\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x19, x21, x13, x19\n" // offset += tile_j * ld_output_col
- "lsr x21, %x[n_channels], #0x3\n"
- "add x10, x14, x14\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x20, x20, x26\n" // offset *= kernel_stride * output_size
- "add x12, x12, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x28, x12, x24, LSL #1\n"
- "mul x19, x19, x25\n" // offset *= output_tile_size
- "add x27, x28, x24, LSL #1\n"
- "add x11, x11, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x1\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x9, x13, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.8h }, [x20]\n"
- "ld1r { v17.8h }, [x19]\n"
- "add x26, x27, x24, LSL #1\n"
- "add x25, x10, x14\n"
- "add x24, x11, x23, LSL #1\n"
- "lsl x13, x13, #0x1\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x22\n"
- "cbz x21, 4f\n"
- "ldr q16, [x9, #0x0]\n"
- "cmp x22, x21, LSL #4\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q1, [x9, #0x20]\n"
- "ldr q2, [x9, #0x30]\n"
- "ldr q3, [x9, #0x40]\n"
- "ldr q4, [x9, #0x50]\n"
- "ldr q5, [x9, #0x60]\n"
- "ldr q6, [x9, #0x70]\n"
- "ldr q7, [x9, #0x80]\n"
- "ldr q8, [x9, #0x90]\n"
- "ldr q9, [x28, x14]\n"
- "add x9, x9, #0xa0\n"
- "ld1 { v10.8h }, [x12]\n"
- "ldr q11, [x12, x25]\n"
- "ldr q12, [x28, x10]\n"
- "ldr q13, [x27, x14]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x27, x28, x25, LSL #1\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #1\n"
+ "lsl x14, x14, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q16, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "add x22, x22, #0x10\n"
- "cmp x22, x21, LSL #4\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x26]\n"
- "add x19, x19, #0x10\n"
+ "ld1 { v9.8h }, [x27]\n"
+ "ldr q16, [x10, #0x0]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x11]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x25]\n"
- "ldr q10, [x27, x10]\n"
+ "ldr q11, [x27, x26]\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"add x20, x20, #0x10\n"
- "ldr q16, [x9, #0x0]\n"
+ "add x21, x21, #0x10\n"
"fmla v28.8h, v5.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x12, x14]\n"
+ "ldr q12, [x13, x15]\n"
"fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x13, x11]\n"
"fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q9, [x12, x10]\n"
- "add x12, x12, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmla v29.8h, v6.8h, v13.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
- "ld1 { v11.8h }, [x28]\n"
+ "ld1 { v11.8h }, [x9]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x25]\n"
- "add x28, x28, #0x10\n"
+ "ldr q12, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x9, #0x50]\n"
+ "ldr q4, [x10, #0x50]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ld1 { v9.8h }, [x27]\n"
- "ldr q1, [x9, #0x20]\n"
+ "ld1 { v9.8h }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x10, #0x10]\n"
"fmla v31.8h, v2.8h, v12.8h\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q2, [x9, #0x30]\n"
+ "ldr q2, [x10, #0x30]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x27, x25]\n"
- "add x27, x27, #0x10\n"
+ "ldr q10, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
- "ldr q13, [x27, x14]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x14]\n"
+ "ldr q11, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x26, x10]\n"
+ "ldr q12, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "add x26, x26, #0x10\n"
- "ldr q11, [x12, x25]\n"
+ "ldr q11, [x13, x26]\n"
"fmla v28.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x9, x15]\n"
"fmla v29.8h, v8.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "ldr q9, [x28, x14]\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v18.8h\n"
- "ld1 { v10.8h }, [x12]\n"
+ "ldr q8, [x10, #0x90]\n"
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
- "ldr q12, [x28, x10]\n"
- "ldr q3, [x9, #0x40]\n"
+ "add x27, x27, #0x10\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x11]\n"
- "ldr q5, [x9, #0x60]\n"
+ "st1 { v28.8h }, [x12]\n"
+ "add x10, x10, #0xa0\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x11, x13]\n"
- "add x11, x11, #0x10\n"
- "st1 { v30.8h }, [x24]\n"
- "ldr q6, [x9, #0x70]\n"
- "ldr q7, [x9, #0x80]\n"
- "str q31, [x24, x13]\n"
- "add x24, x24, #0x10\n"
- "ldr q8, [x9, #0x90]\n"
- "add x9, x9, #0xa0\n"
+ "str q29, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v30.8h }, [x25]\n"
+ "str q31, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ld1 { v9.8h }, [x26]\n"
+ "ld1 { v9.8h }, [x27]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x11]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x26, x25]\n"
- "ldr q10, [x27, x10]\n"
+ "ldr q11, [x27, x26]\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v5.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x12, x14]\n"
+ "ldr q12, [x13, x15]\n"
"fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x13, x11]\n"
"fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q9, [x12, x10]\n"
- "add x12, x12, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmla v29.8h, v6.8h, v13.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
- "ld1 { v11.8h }, [x28]\n"
+ "ld1 { v11.8h }, [x9]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x28, x25]\n"
- "add x28, x28, #0x10\n"
+ "ldr q12, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ld1 { v9.8h }, [x27]\n"
+ "ld1 { v9.8h }, [x28]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x27, x25]\n"
- "add x27, x27, #0x10\n"
+ "ldr q10, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x26, x14]\n"
+ "ldr q11, [x27, x15]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x26, x10]\n"
+ "ldr q12, [x27, x11]\n"
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla v28.8h, v6.8h, v9.8h\n"
"fmla v29.8h, v8.8h, v10.8h\n"
"fmax v28.8h, v28.8h, v18.8h\n"
@@ -289,83 +289,83 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmax v31.8h, v31.8h, v18.8h\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x11]\n"
+ "st1 { v28.8h }, [x12]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x11, x13]\n"
- "add x11, x11, #0x10\n"
- "st1 { v30.8h }, [x24]\n"
- "str q31, [x24, x13]\n"
- "add x24, x24, #0x10\n"
+ "str q29, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v30.8h }, [x25]\n"
+ "str q31, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 57f\n"
- "ldr q16, [x9, #0x0]\n"
- "ldr q0, [x9, #0x10]\n"
- "ldr q1, [x9, #0x20]\n"
- "ldr q2, [x9, #0x30]\n"
- "add x23, x28, x14\n"
- "add x22, x12, XZR\n"
- "ldr q3, [x9, #0x40]\n"
- "ldr q4, [x9, #0x50]\n"
- "add x21, x12, x25\n"
- "add x20, x28, x10\n"
- "ldr q5, [x9, #0x60]\n"
- "ldr q6, [x9, #0x70]\n"
- "add x19, x27, x14\n"
- "ldr q7, [x9, #0x80]\n"
- "ldr q8, [x9, #0x90]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
"tbz %x[n_channels], #2, 6f\n"
- "ldr d9, [x23], #0x8\n"
- "ldr d10, [x22], #0x8\n"
- "ldr d11, [x21], #0x8\n"
- "ldr d12, [x20], #0x8\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v9.s }[2], [x23], #0x4\n"
- "ld1 { v10.s }[2], [x22], #0x4\n"
- "ld1 { v11.s }[2], [x21], #0x4\n"
- "ld1 { v12.s }[2], [x20], #0x4\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[6], [x23]\n"
- "ld1 { v10.h }[6], [x22]\n"
- "ld1 { v11.h }[6], [x21]\n"
- "ld1 { v12.h }[6], [x20]\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 8f\n"
"5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[4], [x23]\n"
- "ld1 { v10.h }[4], [x22]\n"
- "ld1 { v11.h }[4], [x21]\n"
- "ld1 { v12.h }[4], [x20]\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 8f\n"
"6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 7f\n"
- "ldr s9, [x23], #0x4\n"
- "ldr s10, [x22], #0x4\n"
- "ldr s11, [x21], #0x4\n"
- "ldr s12, [x20], #0x4\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[2], [x23]\n"
- "ld1 { v10.h }[2], [x22]\n"
- "ld1 { v11.h }[2], [x21]\n"
- "ld1 { v12.h }[2], [x20]\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x23, #0x0]\n"
- "ldr h10, [x22, #0x0]\n"
- "ldr h11, [x21, #0x0]\n"
- "ldr h12, [x20, #0x0]\n"
- "ldr h13, [x19, #0x0]\n"
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
"mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "add x19, x26, XZR\n"
+ "add x20, x27, XZR\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
@@ -375,258 +375,258 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"tbz %x[n_channels], #2, 10f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 12f\n"
"9:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 12f\n"
"10:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 11f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v30.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x19, x26, x25\n"
+ "add x20, x27, x26\n"
"fmla v29.8h, v6.8h, v13.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
"tbz %x[n_channels], #2, 14f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 16f\n"
"13:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 16f\n"
"14:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 15f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
- "add x19, x12, x14\n"
+ "add x20, x13, x15\n"
"tbz %x[n_channels], #2, 18f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 20f\n"
"17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 20f\n"
"18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 19f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "add x19, x12, x10\n"
+ "add x20, x13, x11\n"
"tbz %x[n_channels], #2, 22f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 24f\n"
"21:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 24f\n"
"22:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 23f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "add x19, x27, x10\n"
+ "add x20, x28, x11\n"
"tbz %x[n_channels], #2, 26f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 28f\n"
"25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 28f\n"
"26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 27f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x19, x28, XZR\n"
+ "add x20, x9, XZR\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"tbz %x[n_channels], #2, 30f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 32f\n"
"29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 32f\n"
"30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 31f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
"fmla v28.8h, v3.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x19, x28, x25\n"
+ "add x20, x9, x26\n"
"tbz %x[n_channels], #2, 34f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 36f\n"
"33:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 36f\n"
"34:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 35f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v29.8h, v5.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v12.8h\n"
- "add x19, x27, XZR\n"
+ "add x20, x28, XZR\n"
"tbz %x[n_channels], #2, 38f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 40f\n"
"37:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 40f\n"
"38:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 39f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v28.8h, v6.8h, v9.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x19, x27, x25\n"
+ "add x20, x28, x26\n"
"tbz %x[n_channels], #2, 42f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 44f\n"
"41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 44f\n"
"42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 43f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
"fmla v29.8h, v8.8h, v10.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
- "add x19, x26, x14\n"
+ "add x20, x27, x15\n"
"tbz %x[n_channels], #2, 46f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 48f\n"
"45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 48f\n"
"46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 47f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "add x19, x26, x10\n"
+ "add x20, x27, x11\n"
"tbz %x[n_channels], #2, 50f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 52f\n"
"49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 52f\n"
"50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 51f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
@@ -639,82 +639,82 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"tbz %x[n_channels], #2, 54f\n"
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.d }[0], [x20], x13\n"
- "add x11, x11, #0x8\n"
- "add x24, x24, #0x8\n"
- "st1 { v30.d }[0], [x19], x13\n"
- "st1 { v29.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 53f\n"
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.s }[2], [x20], x13\n"
- "add x11, x11, #0x4\n"
- "add x24, x24, #0x4\n"
- "st1 { v30.s }[2], [x19], x13\n"
- "st1 { v29.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 56f\n"
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.h }[6], [x20], x13\n"
- "st1 { v30.h }[6], [x19], x13\n"
- "st1 { v29.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[6], [x21], x14\n"
+ "st1 { v30.h }[6], [x20], x14\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 56f\n"
"53:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 56f\n"
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.h }[4], [x20], x13\n"
- "st1 { v30.h }[4], [x19], x13\n"
- "st1 { v29.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[4], [x21], x14\n"
+ "st1 { v30.h }[4], [x20], x14\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 56f\n"
"54:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 55f\n"
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.s }[0], [x20], x13\n"
- "st1 { v30.s }[0], [x19], x13\n"
- "add x11, x11, #0x4\n"
- "add x24, x24, #0x4\n"
- "st1 { v29.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 56f\n"
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.h }[2], [x20], x13\n"
- "st1 { v30.h }[2], [x19], x13\n"
- "st1 { v29.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[2], [x21], x14\n"
+ "st1 { v30.h }[2], [x20], x14\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "mov x20, x11\n"
- "mov x19, x24\n"
- "st1 { v28.h }[0], [x20], x13\n"
- "st1 { v30.h }[0], [x19], x13\n"
- "st1 { v29.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[0], [x21], x14\n"
+ "st1 { v30.h }[0], [x20], x14\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"56:" // Tile loop: Oddments: Store: Bit 2: End
"57:" // Tile loop: End
- "ldr x21, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x21, #0x1\n"
- "add x20, x22, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x21, x19\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x22, x22, x20, LT\n"
- "csel x21, x21, XZR, LT\n"
- "cmp x22, x19\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index a0a44997d1..bea4715313 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -83,21 +83,18 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"lsr x15, %x[n_channels], #0x3\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
- "ldp x13, x12, [x21, #0x0]\n"
- "ldp x11, x10, [x21, #0x10]\n"
- "add x9, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ld1r { v18.8h }, [x20]\n"
- "ld1r { v17.8h }, [x19]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
"sub x27, XZR, x16\n"
"cbz x15, 3f\n"
- "ldp x26, x25, [x9, #0x0]\n"
- "ldp x24, x23, [x9, #0x10]\n"
- "ldr x22, [x9, #0x20]\n"
- "cmp x16, x15, LSL #4\n"
"ldr q16, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
+ "cmp x16, x15, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
"ldr q3, [x14, #0x40]\n"
@@ -106,162 +103,165 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
- "ldr q9, [x26, x28]\n"
"add x14, x14, #0xa0\n"
- "ldr q10, [x25, x28]\n"
- "ldr q11, [x24, x28]\n"
- "ldr q12, [x23, x28]\n"
- "ldr q13, [x22, x28]\n"
+ "ldp x26, x22, [x13, #0x0]\n"
+ "ldr q9, [x26, x28]\n"
+ "ldr q10, [x22, x28]\n"
+ "ldp x25, x24, [x13, #0x10]\n"
+ "ldr q11, [x25, x28]\n"
+ "ldr q12, [x24, x28]\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr q13, [x23, x28]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x21, [x9, #0x28]\n"
- "ldr x20, [x9, #0x30]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "ldr x21, [x13, #0x30]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q9, [x21, x28]\n"
- "ldr x19, [x9, #0x38]\n"
+ "ldr q9, [x22, x28]\n"
+ "ldr q16, [x14, #0x0]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x20, x28]\n"
- "ldr x25, [x9, #0x48]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr x20, [x13, #0x38]\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x26, [x9, #0x40]\n"
- "ldr q10, [x25, x28]\n"
+ "ldr x22, [x13, #0x48]\n"
+ "ldr q10, [x22, x28]\n"
"fmla v28.8h, v5.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x19, x28]\n"
- "ldr x24, [x9, #0x50]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x26, [x13, #0x40]\n"
"fmla v30.8h, v6.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
"ldr q9, [x26, x28]\n"
- "ldr x23, [x9, #0x58]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "ldr x25, [x13, #0x50]\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmla v29.8h, v6.8h, v13.8h\n"
- "ldr x22, [x9, #0x60]\n"
- "ldr x21, [x9, #0x68]\n"
+ "ldr x24, [x13, #0x58]\n"
+ "ldr x23, [x13, #0x60]\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x24, x28]\n"
- "ldr x20, [x9, #0x70]\n"
+ "ldr q11, [x25, x28]\n"
+ "ldr x22, [x13, #0x68]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x23, x28]\n"
- "ldr x19, [x9, #0x78]\n"
+ "ldr q12, [x24, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "ldp x26, x25, [x9, #0x0]\n"
- "ldp x24, x23, [x9, #0x10]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr x20, [x13, #0x78]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q9, [x22, x28]\n"
- "ldr x22, [x9, #0x20]\n"
+ "ldr q9, [x23, x28]\n"
+ "ldr q1, [x14, #0x20]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x14, #0x10]\n"
"fmla v31.8h, v2.8h, v12.8h\n"
- "ldr q13, [x22, x16]\n"
- "add x27, x27, #0x10\n"
+ "ldr q2, [x14, #0x30]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x21, x28]\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr q10, [x22, x28]\n"
+ "ldp x26, x22, [x13, #0x0]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
+ "ldp x25, x24, [x13, #0x10]\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr q13, [x23, x16]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x20, x28]\n"
+ "ldr q11, [x21, x28]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x19, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr q3, [x14, #0x40]\n"
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "ldr q11, [x24, x16]\n"
- "add x28, x28, #0x10\n"
+ "ldr q11, [x25, x16]\n"
+ "ldr q5, [x14, #0x60]\n"
"fmla v28.8h, v6.8h, v9.8h\n"
"fmla v29.8h, v8.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
"ldr q9, [x26, x16]\n"
+ "ldr q10, [x22, x16]\n"
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x24, x16]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v18.8h\n"
- "ldr q10, [x25, x16]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
- "ldr q12, [x23, x16]\n"
"add x16, x16, #0x10\n"
- "cmp x16, x15, LSL #4\n"
+ "add x27, x27, #0x10\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x13, x27]\n"
+ "cmp x16, x15, LSL #4\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x12, x27]\n"
- "ldr q2, [x14, #0x30]\n"
- "str q30, [x11, x27]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "str q31, [x10, x27]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
+ "add x28, x28, #0x10\n"
+ "str q28, [x12, x27]\n"
"add x14, x14, #0xa0\n"
+ "str q29, [x11, x27]\n"
+ "str q30, [x10, x27]\n"
+ "str q31, [x9, x27]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x21, [x9, #0x28]\n"
- "ldr x20, [x9, #0x30]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "ldr x21, [x13, #0x30]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q9, [x21, x28]\n"
- "ldr x19, [x9, #0x38]\n"
+ "ldr q9, [x22, x28]\n"
+ "ldr x20, [x13, #0x38]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q11, [x20, x28]\n"
- "ldr x25, [x9, #0x48]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr x22, [x13, #0x48]\n"
+ "ldr q10, [x22, x28]\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x26, [x9, #0x40]\n"
- "ldr q10, [x25, x28]\n"
+ "ldr x26, [x13, #0x40]\n"
"fmla v28.8h, v5.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v12.8h\n"
- "ldr q12, [x19, x28]\n"
- "ldr x24, [x9, #0x50]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x25, [x13, #0x50]\n"
"fmla v30.8h, v6.8h, v9.8h\n"
- "fmla v31.8h, v3.8h, v13.8h\n"
"ldr q9, [x26, x28]\n"
- "ldr x23, [x9, #0x58]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "ldr x24, [x13, #0x58]\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmla v29.8h, v6.8h, v13.8h\n"
- "ldr x22, [x9, #0x60]\n"
- "ldr x21, [x9, #0x68]\n"
+ "ldr x23, [x13, #0x60]\n"
+ "ldr x22, [x13, #0x68]\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
- "ldr q11, [x24, x28]\n"
- "ldr x20, [x9, #0x70]\n"
+ "ldr q11, [x25, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "ldr q12, [x23, x28]\n"
- "ldr x19, [x9, #0x78]\n"
+ "ldr q12, [x24, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"add x27, x27, #0x10\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q9, [x22, x28]\n"
+ "ldr q9, [x23, x28]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "ldr q10, [x21, x28]\n"
+ "ldr q10, [x22, x28]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x20, x28]\n"
+ "ldr q11, [x21, x28]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x19, x28]\n"
+ "ldr q12, [x20, x28]\n"
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
"add x28, x28, #0x10\n"
@@ -275,92 +275,92 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"fmax v31.8h, v31.8h, v18.8h\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x13, x27]\n"
+ "str q28, [x12, x27]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x12, x27]\n"
- "str q30, [x11, x27]\n"
- "str q31, [x10, x27]\n"
+ "str q29, [x11, x27]\n"
+ "str q30, [x10, x27]\n"
+ "str q31, [x9, x27]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 56f\n"
- "mov x27, x28\n"
- "ldr x26, [x9, #0x0]\n"
- "ldr x25, [x9, #0x8]\n"
- "ldr x24, [x9, #0x10]\n"
- "add x13, x13, x27\n"
- "add x12, x12, x27\n"
- "ldr x23, [x9, #0x18]\n"
- "ldr x22, [x9, #0x20]\n"
- "add x11, x11, x27\n"
- "add x10, x10, x27\n"
"ldr q16, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "add x26, x26, x28\n"
- "add x25, x25, x28\n"
+ "mov x27, x28\n"
+ "add x12, x12, x27\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x24, x24, x28\n"
- "add x23, x23, x28\n"
+ "add x11, x11, x27\n"
+ "add x10, x10, x27\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x22, x22, x28\n"
+ "add x9, x9, x27\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
+ "ldr x24, [x13, #0x0]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ "ldr x22, [x13, #0x10]\n"
+ "ldr x21, [x13, #0x18]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x20]\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.d }[0], [x26], #0x8\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[6], [x26], #0x2\n"
- "ld1 { v10.h }[6], [x25], #0x2\n"
- "ld1 { v11.h }[6], [x24], #0x2\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
- "ld1 { v13.h }[6], [x22], #0x2\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 7f\n"
"4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[4], [x26], #0x2\n"
- "ld1 { v10.h }[4], [x25], #0x2\n"
- "ld1 { v11.h }[4], [x24], #0x2\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
- "ld1 { v13.h }[4], [x22], #0x2\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 7f\n"
"5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.s }[0], [x26], #0x4\n"
- "ld1 { v10.s }[0], [x25], #0x4\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[2], [x26], #0x2\n"
- "ld1 { v10.h }[2], [x25], #0x2\n"
- "ld1 { v11.h }[2], [x24], #0x2\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
- "ld1 { v13.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 7f\n"
"6:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x26], #0x2\n"
- "ld1 { v10.h }[0], [x25], #0x2\n"
- "ld1 { v11.h }[0], [x24], #0x2\n"
- "ld1 { v12.h }[0], [x23], #0x2\n"
- "ld1 { v13.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
"mov v28.16b, v16.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x21, [x9, #0x28]\n"
- "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x28]\n"
+ "add x20, x20, x28\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
@@ -370,27 +370,27 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"tbz %x[n_channels], #2, 9f\n"
- "ld1 { v9.d }[0], [x21], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v9.s }[2], [x21], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.h }[6], [x21], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 11f\n"
"8:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.h }[4], [x21], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 11f\n"
"9:" // Oddments: Load input (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 11f\n"
"10:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x21], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (3, 0): Bit 2: End
"fmla v30.8h, v6.8h, v9.8h\n"
- "ldr x20, [x9, #0x30]\n"
+ "ldr x20, [x13, #0x30]\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"add x20, x20, x28\n"
"fmla v29.8h, v6.8h, v13.8h\n"
@@ -416,176 +416,176 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x19, [x9, #0x38]\n"
+ "ldr x20, [x13, #0x38]\n"
"fmla v31.8h, v8.8h, v11.8h\n"
- "add x19, x19, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v12.h }[6], [x19], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v12.h }[4], [x19], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: Load input (0, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x26, [x9, #0x40]\n"
+ "ldr x20, [x13, #0x40]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "add x26, x26, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 21f\n"
- "ld1 { v9.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v9.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 23f\n"
"20:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v9.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 23f\n"
"21:" // Oddments: Load input (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 23f\n"
"22:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x26], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x25, [x9, #0x48]\n"
+ "ldr x20, [x13, #0x48]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "add x25, x25, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 25f\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.h }[6], [x25], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 27f\n"
"24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.h }[4], [x25], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 27f\n"
"25:" // Oddments: Load input (2, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 27f\n"
"26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 2): Bit 2: End
- "ldr x24, [x9, #0x50]\n"
+ "ldr x20, [x13, #0x50]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x24, x24, x28\n"
+ "add x20, x20, x28\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"tbz %x[n_channels], #2, 29f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 31f\n"
"28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 31f\n"
"29:" // Oddments: Load input (1, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 31f\n"
"30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x23, [x9, #0x58]\n"
+ "ldr x20, [x13, #0x58]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x23, x23, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 33f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 35f\n"
"32:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 35f\n"
"33:" // Oddments: Load input (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 35f\n"
"34:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x22, [x9, #0x60]\n"
+ "ldr x20, [x13, #0x60]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v12.8h\n"
- "add x22, x22, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 37f\n"
- "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v9.h }[6], [x22], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 39f\n"
"36:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v9.h }[4], [x22], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 39f\n"
"37:" // Oddments: Load input (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 39f\n"
"38:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x21, [x9, #0x68]\n"
+ "ldr x20, [x13, #0x68]\n"
"fmla v28.8h, v6.8h, v9.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 41f\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 43f\n"
"40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 43f\n"
"41:" // Oddments: Load input (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 43f\n"
"42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x20, [x9, #0x70]\n"
+ "ldr x20, [x13, #0x70]\n"
"fmla v29.8h, v8.8h, v10.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
"add x20, x20, x28\n"
@@ -609,29 +609,29 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x19, [x9, #0x78]\n"
+ "ldr x20, [x13, #0x78]\n"
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "add x19, x19, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 49f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v12.h }[6], [x19], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 51f\n"
"48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v12.h }[4], [x19], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 51f\n"
"49:" // Oddments: Load input (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 51f\n"
"50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (3, 2): Bit 2: End
"fmla v30.8h, v8.8h, v12.8h\n"
"fmla v31.8h, v7.8h, v12.8h\n"
@@ -644,52 +644,50 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"tbz %x[n_channels], #2, 53f\n"
- "st1 { v28.d }[0], [x13], #0x8\n"
- "st1 { v29.d }[0], [x12], #0x8\n"
- "st1 { v30.d }[0], [x11], #0x8\n"
- "st1 { v31.d }[0], [x10], #0x8\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
- "st1 { v28.s }[2], [x13], #0x4\n"
- "st1 { v29.s }[2], [x12], #0x4\n"
- "st1 { v30.s }[2], [x11], #0x4\n"
- "st1 { v31.s }[2], [x10], #0x4\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "st1 { v28.h }[6], [x13], #0x2\n"
- "st1 { v29.h }[6], [x12], #0x2\n"
- "st1 { v30.h }[6], [x11], #0x2\n"
- "st1 { v31.h }[6], [x10], #0x2\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
"b 55f\n"
"52:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 55f\n"
- "st1 { v28.h }[4], [x13], #0x2\n"
- "st1 { v29.h }[4], [x12], #0x2\n"
- "st1 { v30.h }[4], [x11], #0x2\n"
- "st1 { v31.h }[4], [x10], #0x2\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
"b 55f\n"
"53:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 54f\n"
- "st1 { v28.s }[0], [x13], #0x4\n"
- "st1 { v29.s }[0], [x12], #0x4\n"
- "st1 { v30.s }[0], [x11], #0x4\n"
- "st1 { v31.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "st1 { v28.h }[2], [x13], #0x2\n"
- "st1 { v29.h }[2], [x12], #0x2\n"
- "st1 { v30.h }[2], [x11], #0x2\n"
- "st1 { v31.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
"b 55f\n"
"54:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "st1 { v28.h }[0], [x13], #0x2\n"
- "st1 { v29.h }[0], [x12], #0x2\n"
- "st1 { v30.h }[0], [x11], #0x2\n"
- "st1 { v31.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
"55:" // Oddments: Store: Bit 2: End
-
"56:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 9b4b3ee50d..2b1dc3646d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,83 +87,83 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x24, #0x0\n"
"mov x23, #0x0\n"
- "mov x22, #0x0\n"
"1:" // Tile loop
- "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
"mov x26, #0x3\n"
- "mov x25, #0x3\n"
- "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x23, x24\n" // offset = tile_i * ld_input_row
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x23, x21\n" // offset = tile_i * ld_output_row
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "madd x20, x22, x17, x20\n" // offset += tile_j * ld_input_col
- "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
"lsl x17, x17, #0x1\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x19, x22, x16, x19\n" // offset += tile_j * ld_output_col
- "lsl x16, x16, #0x1\n"
- "lsr x22, %x[n_channels], #0x3\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x20, x20, x26\n" // offset *= kernel_stride * output_size
- "add x15, x15, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x12, x15, x24, LSL #1\n"
- "mul x19, x19, x25\n" // offset *= output_tile_size
- "add x11, x12, x24, LSL #1\n"
- "add x10, x17, x17\n"
- "add x14, x14, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "add x9, x11, x24, LSL #1\n"
- "add x28, x10, x17\n"
- "add x27, x14, x21, LSL #1\n"
+ "lsr x23, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x16, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #1\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x10, x12, x25, LSL #1\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #1\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.8h }, [x20]\n"
- "ld1r { v17.8h }, [x19]\n"
- "add x26, x9, x24, LSL #1\n"
- "add x25, x28, x17\n"
- "add x24, x27, x21, LSL #1\n"
- "add x21, x16, x16\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q16, [x13, #0x0]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q0, [x13, #0x10]\n"
- "ldr q1, [x13, #0x20]\n"
- "ldr q2, [x13, #0x30]\n"
- "ldr q3, [x13, #0x40]\n"
- "ldr q4, [x13, #0x50]\n"
- "ldr q5, [x13, #0x60]\n"
- "ldr q6, [x13, #0x70]\n"
- "ldr q7, [x13, #0x80]\n"
- "ldr q8, [x13, #0x90]\n"
- "ldr q9, [x11, x10]\n"
- "add x13, x13, #0xa0\n"
- "ld1 { v10.8h }, [x15]\n"
- "ldr q11, [x15, x25]\n"
- "ld1 { v12.8h }, [x26]\n"
- "ldr q13, [x12, x10]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x27, x10, x25, LSL #1\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #1\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
"mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "add x23, x23, #0x10\n"
- "cmp x23, x22, LSL #4\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
"mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
- "add x19, x19, #0x10\n"
"add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
"mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
"mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
- "ldr q10, [x11, x28]\n"
+ "ldr q10, [x12, x9]\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x11, x17]\n"
+ "ldr q11, [x12, x8]\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
@@ -172,139 +172,139 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmla v26.8h, v2.8h, v13.8h\n"
"fmla v27.8h, v1.8h, v13.8h\n"
"fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x15, x17]\n"
+ "ldr q13, [x16, x8]\n"
"fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x26, x25]\n"
+ "ldr q12, [x27, x26]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q16, [x14, #0x0]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "ldr q16, [x13, #0x0]\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x15, x28]\n"
+ "ldr q12, [x16, x9]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x12]\n"
+ "ld1 { v11.8h }, [x13]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x9]\n"
+ "ld1 { v12.8h }, [x10]\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x12, x25]\n"
+ "ldr q13, [x13, x26]\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
"fmla v27.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x10, x11]\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "ldr q10, [x9, x10]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
"fmla v23.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x10, x26]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q11, [x9, x25]\n"
- "ldr q13, [x26, x17]\n"
+ "ldr q13, [x27, x8]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x13, x8]\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q12, [x12, x17]\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x26, x28]\n"
+ "ldr q13, [x27, x9]\n"
"fmla v24.8h, v3.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"fmla v28.8h, v8.8h, v11.8h\n"
- "ldr q11, [x12, x28]\n"
+ "ldr q11, [x13, x9]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
- "add x12, x12, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v31.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x10, x9]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
- "ldr q13, [x9, x28]\n"
"fmla v26.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x10, x8]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
- "ldr q12, [x9, x17]\n"
- "add x9, x9, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v25.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x15, x10]\n"
+ "ldr q11, [x16, x11]\n"
"fmla v29.8h, v4.8h, v12.8h\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.8h }, [x16]\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "ld1 { v10.8h }, [x15]\n"
- "ldr q4, [x13, #0x50]\n"
+ "ldr q4, [x14, #0x50]\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "ld1 { v12.8h }, [x11]\n"
+ "ld1 { v12.8h }, [x12]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q1, [x14, #0x20]\n"
"fmax v24.8h, v24.8h, v18.8h\n"
- "ldr q1, [x13, #0x20]\n"
"fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x25]\n"
+ "ldr q11, [x12, x26]\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x11, x11, #0x10\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
- "ldr q9, [x11, x10]\n"
+ "ldr q0, [x14, #0x10]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x14, #0x30]\n"
"fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x26, x10]\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
+ "ldr q13, [x27, x11]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x14, #0x40]\n"
"fmax v23.8h, v23.8h, v18.8h\n"
- "add x26, x26, #0x10\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "ldr q11, [x15, x25]\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
"fmla v29.8h, v8.8h, v13.8h\n"
+ "ldr q8, [x14, #0x90]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
- "ld1 { v12.8h }, [x26]\n"
+ "ldr q7, [x14, #0x80]\n"
"fmla v31.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.8h }, [x27]\n"
"fmax v28.8h, v28.8h, v18.8h\n"
- "ldr q13, [x12, x10]\n"
- "ldr q0, [x13, #0x10]\n"
"fmax v29.8h, v29.8h, v18.8h\n"
+ "add x14, x14, #0xa0\n"
"fmax v30.8h, v30.8h, v18.8h\n"
- "ldr q2, [x13, #0x30]\n"
- "ldr q3, [x13, #0x40]\n"
"fmax v31.8h, v31.8h, v18.8h\n"
"fmin v23.8h, v23.8h, v17.8h\n"
- "st1 { v23.8h }, [x14]\n"
- "ldr q5, [x13, #0x60]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "st1 { v23.8h }, [x15]\n"
"fmin v25.8h, v25.8h, v17.8h\n"
"fmin v26.8h, v26.8h, v17.8h\n"
- "str q24, [x14, x16]\n"
- "ldr q6, [x13, #0x70]\n"
+ "str q24, [x15, x17]\n"
"fmin v27.8h, v27.8h, v17.8h\n"
"fmin v28.8h, v28.8h, v17.8h\n"
- "str q25, [x14, x21]\n"
- "add x14, x14, #0x10\n"
+ "str q25, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
"fmin v29.8h, v29.8h, v17.8h\n"
"fmin v30.8h, v30.8h, v17.8h\n"
- "st1 { v26.8h }, [x27]\n"
- "ldr q7, [x13, #0x80]\n"
+ "st1 { v26.8h }, [x28]\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q27, [x27, x16]\n"
- "ldr q8, [x13, #0x90]\n"
- "add x13, x13, #0xa0\n"
- "str q28, [x27, x21]\n"
- "add x27, x27, #0x10\n"
- "st1 { v29.8h }, [x24]\n"
- "str q30, [x24, x16]\n"
- "str q31, [x24, x21]\n"
- "add x24, x24, #0x10\n"
+ "str q27, [x28, x17]\n"
+ "str q28, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v29.8h }, [x25]\n"
+ "str q30, [x25, x17]\n"
+ "str q31, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
@@ -315,9 +315,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
"mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
- "ldr q10, [x11, x28]\n"
+ "ldr q10, [x12, x9]\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x11, x17]\n"
+ "ldr q11, [x12, x8]\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
@@ -326,92 +326,92 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmla v26.8h, v2.8h, v13.8h\n"
"fmla v27.8h, v1.8h, v13.8h\n"
"fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x15, x17]\n"
+ "ldr q13, [x16, x8]\n"
"fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x26, x25]\n"
+ "ldr q12, [x27, x26]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x15, x28]\n"
+ "ldr q12, [x16, x9]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x12]\n"
+ "ld1 { v11.8h }, [x13]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x9]\n"
+ "ld1 { v12.8h }, [x10]\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q13, [x12, x25]\n"
+ "ldr q13, [x13, x26]\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
"fmla v27.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x10, x11]\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "ldr q10, [x9, x10]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
"fmla v23.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x10, x26]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q11, [x9, x25]\n"
- "ldr q13, [x26, x17]\n"
+ "ldr q13, [x27, x8]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x13, x8]\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q12, [x12, x17]\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x26, x28]\n"
+ "ldr q13, [x27, x9]\n"
"fmla v24.8h, v3.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"fmla v28.8h, v8.8h, v11.8h\n"
- "ldr q11, [x12, x28]\n"
+ "ldr q11, [x13, x9]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
- "add x12, x12, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmla v31.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x10, x9]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
- "ldr q13, [x9, x28]\n"
"fmla v26.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x10, x8]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
- "ldr q12, [x9, x17]\n"
- "add x9, x9, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmla v25.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
- "ldr q11, [x15, x10]\n"
+ "ldr q11, [x16, x11]\n"
"fmla v29.8h, v4.8h, v12.8h\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "ld1 { v12.8h }, [x11]\n"
+ "ld1 { v12.8h }, [x12]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
"fmax v24.8h, v24.8h, v18.8h\n"
"fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x25]\n"
+ "ldr q11, [x12, x26]\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmin v24.8h, v24.8h, v17.8h\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "add x11, x11, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v27.8h, v8.8h, v13.8h\n"
- "ldr q13, [x26, x10]\n"
+ "ldr q13, [x27, x11]\n"
"fmax v27.8h, v27.8h, v18.8h\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
"fmax v23.8h, v23.8h, v18.8h\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmax v25.8h, v25.8h, v18.8h\n"
@@ -424,94 +424,94 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
"fmin v23.8h, v23.8h, v17.8h\n"
- "st1 { v23.8h }, [x14]\n"
+ "st1 { v23.8h }, [x15]\n"
"fmin v25.8h, v25.8h, v17.8h\n"
"fmin v26.8h, v26.8h, v17.8h\n"
- "str q24, [x14, x16]\n"
+ "str q24, [x15, x17]\n"
"fmin v27.8h, v27.8h, v17.8h\n"
"fmin v28.8h, v28.8h, v17.8h\n"
- "str q25, [x14, x21]\n"
- "add x14, x14, #0x10\n"
+ "str q25, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
"fmin v29.8h, v29.8h, v17.8h\n"
"fmin v30.8h, v30.8h, v17.8h\n"
- "st1 { v26.8h }, [x27]\n"
+ "st1 { v26.8h }, [x28]\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q27, [x27, x16]\n"
- "str q28, [x27, x21]\n"
- "add x27, x27, #0x10\n"
- "st1 { v29.8h }, [x24]\n"
- "str q30, [x24, x16]\n"
- "str q31, [x24, x21]\n"
- "add x24, x24, #0x10\n"
+ "str q27, [x28, x17]\n"
+ "str q28, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v29.8h }, [x25]\n"
+ "str q30, [x25, x17]\n"
+ "str q31, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 93f\n"
- "ldr q16, [x13, #0x0]\n"
- "ldr q0, [x13, #0x10]\n"
- "ldr q1, [x13, #0x20]\n"
- "ldr q2, [x13, #0x30]\n"
- "add x23, x11, x10\n"
- "add x22, x15, XZR\n"
- "ldr q3, [x13, #0x40]\n"
- "ldr q4, [x13, #0x50]\n"
- "add x21, x15, x25\n"
- "add x20, x26, XZR\n"
- "ldr q5, [x13, #0x60]\n"
- "ldr q6, [x13, #0x70]\n"
- "add x19, x12, x10\n"
- "ldr q7, [x13, #0x80]\n"
- "ldr q8, [x13, #0x90]\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
"tbz %x[n_channels], #2, 6f\n"
- "ldr d9, [x23], #0x8\n"
- "ldr d10, [x22], #0x8\n"
- "ldr d11, [x21], #0x8\n"
- "ldr d12, [x20], #0x8\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v9.s }[2], [x23], #0x4\n"
- "ld1 { v10.s }[2], [x22], #0x4\n"
- "ld1 { v11.s }[2], [x21], #0x4\n"
- "ld1 { v12.s }[2], [x20], #0x4\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[6], [x23]\n"
- "ld1 { v10.h }[6], [x22]\n"
- "ld1 { v11.h }[6], [x21]\n"
- "ld1 { v12.h }[6], [x20]\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 8f\n"
"5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[4], [x23]\n"
- "ld1 { v10.h }[4], [x22]\n"
- "ld1 { v11.h }[4], [x21]\n"
- "ld1 { v12.h }[4], [x20]\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 8f\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 7f\n"
- "ldr s9, [x23], #0x4\n"
- "ldr s10, [x22], #0x4\n"
- "ldr s11, [x21], #0x4\n"
- "ldr s12, [x20], #0x4\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[2], [x23]\n"
- "ld1 { v10.h }[2], [x22]\n"
- "ld1 { v11.h }[2], [x21]\n"
- "ld1 { v12.h }[2], [x20]\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x23, #0x0]\n"
- "ldr h10, [x22, #0x0]\n"
- "ldr h11, [x21, #0x0]\n"
- "ldr h12, [x20, #0x0]\n"
- "ldr h13, [x19, #0x0]\n"
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
"mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
"mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "add x19, x26, x25\n"
+ "add x20, x27, x26\n"
"mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
"mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
"mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
@@ -529,483 +529,483 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmla v27.8h, v1.8h, v13.8h\n"
"fmla v28.8h, v0.8h, v13.8h\n"
"tbz %x[n_channels], #2, 10f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 12f\n"
"9:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 12f\n"
"10:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 11f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v12.8h\n"
- "add x19, x11, x17\n"
+ "add x20, x12, x8\n"
"tbz %x[n_channels], #2, 14f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 16f\n"
"13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 16f\n"
"14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 15f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "add x19, x15, x17\n"
+ "add x20, x16, x8\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 18f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 20f\n"
"17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 20f\n"
"18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 19f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
"fmla v23.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "add x19, x15, x28\n"
+ "add x20, x16, x9\n"
"tbz %x[n_channels], #2, 22f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 24f\n"
"21:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 24f\n"
"22:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 23f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "add x19, x11, x28\n"
+ "add x20, x12, x9\n"
"tbz %x[n_channels], #2, 26f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 28f\n"
"25:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 28f\n"
"26:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 27f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
- "add x19, x12, XZR\n"
+ "add x20, x13, XZR\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"tbz %x[n_channels], #2, 30f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 32f\n"
"29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 32f\n"
"30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 31f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
"fmla v23.8h, v3.8h, v11.8h\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "add x19, x12, x25\n"
+ "add x20, x13, x26\n"
"tbz %x[n_channels], #2, 34f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 36f\n"
"33:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 36f\n"
"34:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 35f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v25.8h, v5.8h, v13.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x19, x9, XZR\n"
+ "add x20, x10, XZR\n"
"tbz %x[n_channels], #2, 38f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 40f\n"
"37:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 40f\n"
"38:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 39f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v26.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "add x19, x9, x10\n"
+ "add x20, x10, x11\n"
"tbz %x[n_channels], #2, 42f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 44f\n"
"41:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 44f\n"
"42:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 43f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "add x19, x9, x25\n"
+ "add x20, x10, x26\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
"tbz %x[n_channels], #2, 46f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 48f\n"
"45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 48f\n"
"46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 47f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x19, x26, x17\n"
+ "add x20, x27, x8\n"
"tbz %x[n_channels], #2, 50f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 52f\n"
"49:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 52f\n"
"50:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 51f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
"fmla v29.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
- "add x19, x12, x17\n"
+ "add x20, x13, x8\n"
"tbz %x[n_channels], #2, 54f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 56f\n"
"53:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 56f\n"
"54:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 55f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
- "add x19, x12, x28\n"
+ "add x20, x13, x9\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 58f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 57f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 60f\n"
"57:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 60f\n"
"58:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 59f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 60f\n"
"59:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v24.8h, v5.8h, v11.8h\n"
"fmla v25.8h, v4.8h, v11.8h\n"
- "add x19, x26, x28\n"
+ "add x20, x27, x9\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 62f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 64f\n"
"61:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 64f\n"
"62:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 63f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 64f\n"
"63:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"64:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
"fmla v30.8h, v8.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "add x19, x9, x17\n"
+ "add x20, x10, x8\n"
"tbz %x[n_channels], #2, 66f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 68f\n"
"65:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 68f\n"
"66:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 67f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 68f\n"
"67:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "add x19, x15, x10\n"
+ "add x20, x16, x11\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 70f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 72f\n"
"69:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 72f\n"
"70:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 71f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 72f\n"
"71:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"72:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "add x19, x9, x28\n"
+ "add x20, x10, x9\n"
"fmla v25.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 74f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 73f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 76f\n"
"73:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 76f\n"
"74:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 75f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 76f\n"
"75:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v27.8h, v8.8h, v13.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x19, x11, XZR\n"
+ "add x20, x12, XZR\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 78f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 77f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 80f\n"
"77:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 80f\n"
"78:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 79f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 80f\n"
"79:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"80:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "add x19, x11, x25\n"
+ "add x20, x12, x26\n"
"fmla v29.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 82f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 81f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 84f\n"
"81:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 84f\n"
"82:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 83f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 84f\n"
"83:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"84:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "add x19, x26, x10\n"
+ "add x20, x27, x11\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"tbz %x[n_channels], #2, 86f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 85f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 88f\n"
"85:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 88f\n"
"86:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 87f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 88f\n"
"87:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"88:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
@@ -1029,127 +1029,127 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"tbz %x[n_channels], #2, 90f\n"
- "mov x21, x14\n"
- "mov x20, x27\n"
- "mov x19, x24\n"
- "st1 { v23.d }[0], [x21], x16\n"
- "st1 { v26.d }[0], [x20], x16\n"
- "add x14, x14, #0x8\n"
- "add x27, x27, #0x8\n"
- "st1 { v29.d }[0], [x19], x16\n"
- "add x24, x24, #0x8\n"
- "st1 { v24.d }[0], [x21], x16\n"
- "st1 { v27.d }[0], [x20], x16\n"
- "st1 { v30.d }[0], [x19], x16\n"
- "st1 { v25.d }[0], [x21]\n"
- "st1 { v28.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 89f\n"
- "mov x21, x14\n"
- "mov x20, x27\n"
- "mov x19, x24\n"
- "st1 { v23.s }[2], [x21], x16\n"
- "add x14, x14, #0x4\n"
- "st1 { v26.s }[2], [x20], x16\n"
- "add x27, x27, #0x4\n"
- "add x24, x24, #0x4\n"
- "st1 { v29.s }[2], [x19], x16\n"
- "st1 { v24.s }[2], [x21], x16\n"
- "st1 { v27.s }[2], [x20], x16\n"
- "st1 { v30.s }[2], [x19], x16\n"
- "st1 { v25.s }[2], [x21]\n"
- "st1 { v28.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 92f\n"
- "mov x21, x14\n"
- "mov x20, x27\n"
- "mov x19, x24\n"
- "st1 { v23.h }[6], [x21], x16\n"
- "st1 { v26.h }[6], [x20], x16\n"
- "st1 { v29.h }[6], [x19], x16\n"
- "st1 { v24.h }[6], [x21], x16\n"
- "st1 { v27.h }[6], [x20], x16\n"
- "st1 { v30.h }[6], [x19], x16\n"
- "st1 { v25.h }[6], [x21]\n"
- "st1 { v28.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[6], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[6], [x21], x17\n"
+ "st1 { v29.h }[6], [x20], x17\n"
+ "st1 { v24.h }[6], [x22], x17\n"
+ "st1 { v27.h }[6], [x21], x17\n"
+ "st1 { v30.h }[6], [x20], x17\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 92f\n"
"89:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 92f\n"
- "mov x21, x14\n"
- "mov x20, x27\n"
- "st1 { v23.h }[4], [x21], x16\n"
- "mov x19, x24\n"
- "st1 { v26.h }[4], [x20], x16\n"
- "st1 { v29.h }[4], [x19], x16\n"
- "st1 { v24.h }[4], [x21], x16\n"
- "st1 { v27.h }[4], [x20], x16\n"
- "st1 { v30.h }[4], [x19], x16\n"
- "st1 { v25.h }[4], [x21]\n"
- "st1 { v28.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[4], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[4], [x21], x17\n"
+ "st1 { v29.h }[4], [x20], x17\n"
+ "st1 { v24.h }[4], [x22], x17\n"
+ "st1 { v27.h }[4], [x21], x17\n"
+ "st1 { v30.h }[4], [x20], x17\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 92f\n"
"90:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 91f\n"
- "mov x21, x14\n"
- "mov x20, x27\n"
- "st1 { v23.s }[0], [x21], x16\n"
- "mov x19, x24\n"
- "st1 { v26.s }[0], [x20], x16\n"
- "add x14, x14, #0x4\n"
- "st1 { v29.s }[0], [x19], x16\n"
- "add x27, x27, #0x4\n"
- "add x24, x24, #0x4\n"
- "st1 { v24.s }[0], [x21], x16\n"
- "st1 { v27.s }[0], [x20], x16\n"
- "st1 { v30.s }[0], [x19], x16\n"
- "st1 { v25.s }[0], [x21]\n"
- "st1 { v28.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 92f\n"
- "mov x21, x14\n"
- "mov x20, x27\n"
- "mov x19, x24\n"
- "st1 { v23.h }[2], [x21], x16\n"
- "st1 { v26.h }[2], [x20], x16\n"
- "st1 { v29.h }[2], [x19], x16\n"
- "st1 { v24.h }[2], [x21], x16\n"
- "st1 { v27.h }[2], [x20], x16\n"
- "st1 { v30.h }[2], [x19], x16\n"
- "st1 { v25.h }[2], [x21]\n"
- "st1 { v28.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[2], [x21], x17\n"
+ "st1 { v29.h }[2], [x20], x17\n"
+ "st1 { v24.h }[2], [x22], x17\n"
+ "st1 { v27.h }[2], [x21], x17\n"
+ "st1 { v30.h }[2], [x20], x17\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 92f\n"
"91:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "mov x21, x14\n"
- "mov x20, x27\n"
- "st1 { v23.h }[0], [x21], x16\n"
- "mov x19, x24\n"
- "st1 { v26.h }[0], [x20], x16\n"
- "st1 { v29.h }[0], [x19], x16\n"
- "st1 { v24.h }[0], [x21], x16\n"
- "st1 { v27.h }[0], [x20], x16\n"
- "st1 { v30.h }[0], [x19], x16\n"
- "st1 { v25.h }[0], [x21]\n"
- "st1 { v28.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x19]\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[0], [x21], x17\n"
+ "st1 { v29.h }[0], [x20], x17\n"
+ "st1 { v24.h }[0], [x22], x17\n"
+ "st1 { v27.h }[0], [x21], x17\n"
+ "st1 { v30.h }[0], [x20], x17\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "st1 { v28.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"92:" // Tile loop: Oddments: Store: Bit 2: End
"93:" // Tile loop: End
- "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x22, x22, #0x1\n"
- "add x20, x23, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x22, x19\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x23, x23, x20, LT\n"
- "csel x22, x22, XZR, LT\n"
- "cmp x23, x19\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index faf6c91181..878aa29bcf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,457 +87,457 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x17, #0x10\n" // cntb _, ALL, #1
- "lsr x16, %x[n_channels], #0x3\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x3\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.8h }, [x20]\n"
- "ld1r { v17.8h }, [x19]\n"
- "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x12, #0x0\n"
- "sub x11, XZR, x17\n"
- "cbz x16, 3f\n"
- "ldp x10, x9, [x13, #0x0]\n"
- "ldp x28, x27, [x13, #0x10]\n"
- "ldr x26, [x13, #0x20]\n"
- "cmp x17, x16, LSL #4\n"
- "ldr q16, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "ldr q9, [x10, x12]\n"
- "add x14, x14, #0xa0\n"
- "ldr q10, [x9, x12]\n"
- "ldr q11, [x28, x12]\n"
- "ldr q12, [x27, x12]\n"
- "ldr q13, [x26, x12]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x13, #0x0\n"
+ "sub x12, XZR, x8\n"
+ "cbz x17, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x8, x17, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "ldr q9, [x11, x13]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr q12, [x28, x13]\n"
+ "ldr x27, [x14, #0x20]\n"
+ "ldr q13, [x27, x13]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
"mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "ldr x25, [x13, #0x30]\n"
- "ldr x24, [x13, #0x38]\n"
+ "ldr x26, [x14, #0x30]\n"
+ "ldr x25, [x14, #0x38]\n"
"mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
- "ldr x23, [x13, #0x28]\n"
- "ldr x9, [x13, #0x48]\n"
+ "ldr x24, [x14, #0x28]\n"
+ "ldr x10, [x14, #0x48]\n"
+ "ldr q10, [x10, x13]\n"
"fmla v24.8h, v4.8h, v13.8h\n"
"mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x10, [x13, #0x40]\n"
- "ldr q10, [x9, x12]\n"
+ "ldr x11, [x14, #0x40]\n"
"mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
"mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "ldr x28, [x13, #0x50]\n"
- "ldr x27, [x13, #0x58]\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x25, x12]\n"
+ "ldr q11, [x26, x13]\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x26, [x13, #0x60]\n"
+ "ldr x27, [x14, #0x60]\n"
"fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "ldr x25, [x13, #0x70]\n"
- "ldr x9, [x13, #0x88]\n"
+ "ldr x26, [x14, #0x70]\n"
+ "ldr x10, [x14, #0x88]\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v25.8h, v3.8h, v13.8h\n"
- "ldr x22, [x15, #0x0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x12, x12, #0x10\n"
"fmla v26.8h, v2.8h, v13.8h\n"
"fmla v27.8h, v1.8h, v13.8h\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
"fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x24, x12]\n"
+ "ldr q13, [x25, x13]\n"
"fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x23, x12]\n"
+ "ldr q12, [x24, x13]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q16, [x15, #0x0]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "ldr x23, [x13, #0x68]\n"
- "ldr x24, [x13, #0x78]\n"
+ "ldr x24, [x14, #0x68]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x10, x12]\n"
- "ldr x10, [x13, #0x80]\n"
+ "ldr q12, [x11, x13]\n"
+ "ldr x25, [x14, #0x78]\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr q16, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x80]\n"
+ "ldr x20, [x16, #0x18]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x9, x13]\n"
"fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q11, [x28, x12]\n"
- "ldr q13, [x27, x12]\n"
+ "ldr q13, [x28, x13]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "ldr q12, [x26, x12]\n"
- "ldr x28, [x13, #0x90]\n"
+ "ldr q12, [x27, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
- "ldr x26, [x13, #0xa0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x27, [x14, #0xa0]\n"
+ "ldr x28, [x14, #0x98]\n"
"fmla v26.8h, v0.8h, v11.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
+ "ldr q10, [x24, x13]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q10, [x23, x12]\n"
- "ldr x23, [x13, #0xa8]\n"
+ "ldr x24, [x14, #0xa8]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x11, x13]\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q12, [x10, x12]\n"
- "ldr x10, [x13, #0xc0]\n"
+ "ldr x11, [x14, #0xc0]\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v23.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x26, x13]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q11, [x25, x12]\n"
- "ldr q13, [x24, x12]\n"
+ "ldr q13, [x25, x13]\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
- "ldr x25, [x13, #0xb0]\n"
- "ldr x24, [x13, #0xb8]\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "ldr x25, [x14, #0xb8]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x9, x12]\n"
+ "ldr q11, [x10, x13]\n"
"fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x28, x12]\n"
+ "ldr q13, [x9, x13]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x28, x13]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
- "ldr q12, [x27, x12]\n"
"fmla v25.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x27, x13]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
- "ldr q11, [x26, x12]\n"
- "ldr x26, [x13, #0x20]\n"
+ "ldr x27, [x14, #0x20]\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x23, x12]\n"
+ "ldr q13, [x24, x13]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
- "ldr q12, [x25, x12]\n"
+ "ldr q12, [x26, x13]\n"
"fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q4, [x15, #0x50]\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "fmax v24.8h, v24.8h, v18.8h\n"
- "ldr q1, [x14, #0x20]\n"
+ "ldr q1, [x15, #0x20]\n"
"fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x24, x12]\n"
+ "ldr q11, [x25, x13]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmax v23.8h, v23.8h, v18.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmin v23.8h, v23.8h, v17.8h\n"
- "str q23, [x22, x11]\n"
+ "str q23, [x23, x12]\n"
"fmla v29.8h, v0.8h, v12.8h\n"
+ "ldr q0, [x15, #0x10]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr x22, [x15, #0x20]\n"
- "fmin v24.8h, v24.8h, v17.8h\n"
+ "ldr q2, [x15, #0x30]\n"
"fmla v27.8h, v8.8h, v13.8h\n"
+ "ldr q13, [x11, x13]\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "ldr q13, [x10, x12]\n"
- "fmax v26.8h, v26.8h, v18.8h\n"
+ "ldr q3, [x15, #0x40]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v18.8h\n"
- "ldp x10, x9, [x13, #0x0]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
"fmla v29.8h, v8.8h, v13.8h\n"
+ "ldr q8, [x15, #0x90]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v18.8h\n"
- "ldp x28, x27, [x13, #0x10]\n"
+ "ldr q7, [x15, #0x80]\n"
"fmla v31.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x27, x8]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "ldr x23, [x16, #0x20]\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "ldr q9, [x11, x8]\n"
+ "ldr q10, [x10, x8]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
"fmin v25.8h, v25.8h, v17.8h\n"
- "str q24, [x21, x11]\n"
- "ldr x21, [x15, #0x28]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
+ "ldr q11, [x9, x8]\n"
"fmin v26.8h, v26.8h, v17.8h\n"
"fmin v27.8h, v27.8h, v17.8h\n"
- "str q25, [x20, x11]\n"
- "ldr x20, [x15, #0x30]\n"
+ "ldr q12, [x28, x8]\n"
"fmax v28.8h, v28.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v18.8h\n"
- "str q26, [x19, x11]\n"
- "ldr x19, [x15, #0x38]\n"
+ "str q24, [x22, x12]\n"
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
- "str q27, [x22, x11]\n"
- "ldr x22, [x15, #0x40]\n"
- "ldr q9, [x10, x17]\n"
- "ldr q10, [x9, x17]\n"
+ "str q25, [x21, x12]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "str q26, [x20, x12]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x8, x8, #0x10\n"
+ "str q27, [x23, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "cmp x8, x17, LSL #4\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "ldr q11, [x28, x17]\n"
- "ldr q12, [x27, x17]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "str q28, [x22, x12]\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "ldr q13, [x26, x17]\n"
- "add x17, x17, #0x10\n"
- "cmp x17, x16, LSL #4\n"
- "str q28, [x21, x11]\n"
- "add x12, x12, #0x10\n"
- "str q29, [x20, x11]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q2, [x14, #0x30]\n"
- "str q30, [x19, x11]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "str q31, [x22, x11]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
+ "str q29, [x21, x12]\n"
+ "add x15, x15, #0xa0\n"
+ "str q30, [x20, x12]\n"
+ "str q31, [x23, x12]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
"mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
- "ldr x25, [x13, #0x30]\n"
- "ldr x24, [x13, #0x38]\n"
+ "ldr x26, [x14, #0x30]\n"
+ "ldr x25, [x14, #0x38]\n"
"mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
- "ldr x23, [x13, #0x28]\n"
- "ldr x9, [x13, #0x48]\n"
+ "ldr x24, [x14, #0x28]\n"
+ "ldr x10, [x14, #0x48]\n"
+ "ldr q10, [x10, x13]\n"
"fmla v24.8h, v4.8h, v13.8h\n"
"mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x10, [x13, #0x40]\n"
- "ldr q10, [x9, x12]\n"
+ "ldr x11, [x14, #0x40]\n"
"mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
"mov v28.16b, v16.16b\n fmla v28.8h, v3.8h, v9.8h\n"
- "ldr x28, [x13, #0x50]\n"
- "ldr x27, [x13, #0x58]\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "ldr q11, [x25, x12]\n"
+ "ldr q11, [x26, x13]\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v2.8h, v9.8h\n"
- "ldr x26, [x13, #0x60]\n"
+ "ldr x27, [x14, #0x60]\n"
"fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "ldr x25, [x13, #0x70]\n"
- "ldr x9, [x13, #0x88]\n"
+ "ldr x26, [x14, #0x70]\n"
+ "ldr x10, [x14, #0x88]\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v25.8h, v3.8h, v13.8h\n"
- "ldr x22, [x15, #0x0]\n"
- "add x11, x11, #0x10\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x12, x12, #0x10\n"
"fmla v26.8h, v2.8h, v13.8h\n"
"fmla v27.8h, v1.8h, v13.8h\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
"fmla v28.8h, v0.8h, v13.8h\n"
- "ldr q13, [x24, x12]\n"
+ "ldr q13, [x25, x13]\n"
"fmla v29.8h, v6.8h, v12.8h\n"
- "ldr q12, [x23, x12]\n"
+ "ldr q12, [x24, x13]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "ldr x23, [x13, #0x68]\n"
- "ldr x24, [x13, #0x78]\n"
+ "ldr x24, [x14, #0x68]\n"
+ "ldr x25, [x14, #0x78]\n"
"fmla v24.8h, v0.8h, v13.8h\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "ldr q12, [x10, x12]\n"
- "ldr x10, [x13, #0x80]\n"
+ "ldr q12, [x11, x13]\n"
+ "ldr x11, [x14, #0x80]\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
- "ldr x19, [x15, #0x18]\n"
+ "ldr x20, [x16, #0x18]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x9, x13]\n"
"fmla v23.8h, v1.8h, v13.8h\n"
- "ldr q11, [x28, x12]\n"
- "ldr q13, [x27, x12]\n"
+ "ldr q13, [x28, x13]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "ldr q12, [x26, x12]\n"
- "ldr x28, [x13, #0x90]\n"
+ "ldr q12, [x27, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
- "ldr x26, [x13, #0xa0]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x27, [x14, #0xa0]\n"
+ "ldr x28, [x14, #0x98]\n"
"fmla v26.8h, v0.8h, v11.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
+ "ldr q10, [x24, x13]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q10, [x23, x12]\n"
- "ldr x23, [x13, #0xa8]\n"
+ "ldr x24, [x14, #0xa8]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x11, x13]\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "ldr q12, [x10, x12]\n"
- "ldr x10, [x13, #0xc0]\n"
+ "ldr x11, [x14, #0xc0]\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v23.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x26, x13]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
- "ldr q11, [x25, x12]\n"
- "ldr q13, [x24, x12]\n"
+ "ldr q13, [x25, x13]\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
- "ldr x25, [x13, #0xb0]\n"
- "ldr x24, [x13, #0xb8]\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "ldr x25, [x14, #0xb8]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x9, x12]\n"
+ "ldr q11, [x10, x13]\n"
"fmla v29.8h, v7.8h, v13.8h\n"
- "ldr q13, [x28, x12]\n"
+ "ldr q13, [x9, x13]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x28, x13]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
- "ldr q12, [x27, x12]\n"
"fmla v25.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x27, x13]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
- "ldr q11, [x26, x12]\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "ldr q13, [x23, x12]\n"
+ "ldr q13, [x24, x13]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
- "ldr q12, [x25, x12]\n"
+ "ldr q12, [x26, x13]\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
"fmax v24.8h, v24.8h, v18.8h\n"
"fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x24, x12]\n"
+ "ldr q11, [x25, x13]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmax v23.8h, v23.8h, v18.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmin v23.8h, v23.8h, v17.8h\n"
- "str q23, [x22, x11]\n"
+ "str q23, [x23, x12]\n"
"fmla v29.8h, v0.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr x22, [x15, #0x20]\n"
+ "ldr x23, [x16, #0x20]\n"
"fmin v24.8h, v24.8h, v17.8h\n"
"fmla v27.8h, v8.8h, v13.8h\n"
+ "ldr q13, [x11, x13]\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "ldr q13, [x10, x12]\n"
"fmax v26.8h, v26.8h, v18.8h\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmax v25.8h, v25.8h, v18.8h\n"
- "str q24, [x21, x11]\n"
+ "str q24, [x22, x12]\n"
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
"fmax v27.8h, v27.8h, v18.8h\n"
- "ldr x21, [x15, #0x28]\n"
+ "ldr x22, [x16, #0x28]\n"
"fmla v31.8h, v6.8h, v13.8h\n"
"fmin v25.8h, v25.8h, v17.8h\n"
- "str q25, [x20, x11]\n"
- "ldr x20, [x15, #0x30]\n"
+ "str q25, [x21, x12]\n"
+ "ldr x21, [x16, #0x30]\n"
"fmin v26.8h, v26.8h, v17.8h\n"
"fmin v27.8h, v27.8h, v17.8h\n"
- "str q26, [x19, x11]\n"
- "ldr x19, [x15, #0x38]\n"
+ "str q26, [x20, x12]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmax v28.8h, v28.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v18.8h\n"
- "str q27, [x22, x11]\n"
- "ldr x22, [x15, #0x40]\n"
+ "str q27, [x23, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
- "add x12, x12, #0x10\n"
+ "add x13, x13, #0x10\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x21, x11]\n"
+ "str q28, [x22, x12]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x20, x11]\n"
- "str q30, [x19, x11]\n"
- "str q31, [x22, x11]\n"
+ "str q29, [x21, x12]\n"
+ "str q30, [x20, x12]\n"
+ "str q31, [x23, x12]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 92f\n"
- "ldr x10, [x13, #0x0]\n"
- "ldr x9, [x13, #0x8]\n"
- "ldr x28, [x13, #0x10]\n"
- "ldr x27, [x13, #0x18]\n"
- "mov x11, x12\n"
- "add x10, x10, x12\n"
- "ldr x26, [x13, #0x20]\n"
- "ldr q16, [x14, #0x0]\n"
- "add x9, x9, x12\n"
- "add x28, x28, x12\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "add x27, x27, x12\n"
- "add x26, x26, x12\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x13\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x24, [x14, #0x0]\n"
+ "ldr x23, [x14, #0x8]\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "ldr x22, [x14, #0x10]\n"
+ "ldr x21, [x14, #0x18]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.d }[0], [x10], #0x8\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
- "ld1 { v13.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.s }[2], [x10], #0x4\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[6], [x10], #0x2\n"
- "ld1 { v10.h }[6], [x9], #0x2\n"
- "ld1 { v11.h }[6], [x28], #0x2\n"
- "ld1 { v12.h }[6], [x27], #0x2\n"
- "ld1 { v13.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 7f\n"
"4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[4], [x10], #0x2\n"
- "ld1 { v10.h }[4], [x9], #0x2\n"
- "ld1 { v11.h }[4], [x28], #0x2\n"
- "ld1 { v12.h }[4], [x27], #0x2\n"
- "ld1 { v13.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 7f\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.s }[0], [x10], #0x4\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
- "ld1 { v11.s }[0], [x28], #0x4\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
- "ld1 { v13.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[2], [x10], #0x2\n"
- "ld1 { v10.h }[2], [x9], #0x2\n"
- "ld1 { v11.h }[2], [x28], #0x2\n"
- "ld1 { v12.h }[2], [x27], #0x2\n"
- "ld1 { v13.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 7f\n"
"6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x10], #0x2\n"
- "ld1 { v10.h }[0], [x9], #0x2\n"
- "ld1 { v11.h }[0], [x28], #0x2\n"
- "ld1 { v12.h }[0], [x27], #0x2\n"
- "ld1 { v13.h }[0], [x26], #0x2\n"
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
"mov v23.16b, v16.16b\n fmla v23.8h, v8.8h, v9.8h\n"
"mov v25.16b, v16.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "ldr x23, [x13, #0x28]\n"
- "add x23, x23, x12\n"
+ "ldr x20, [x14, #0x28]\n"
+ "add x20, x20, x13\n"
"mov v24.16b, v16.16b\n fmla v24.8h, v7.8h, v9.8h\n"
"mov v26.16b, v16.16b\n fmla v26.8h, v5.8h, v9.8h\n"
"mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
@@ -555,502 +555,502 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"fmla v27.8h, v1.8h, v13.8h\n"
"fmla v28.8h, v0.8h, v13.8h\n"
"tbz %x[n_channels], #2, 9f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 11f\n"
"8:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 11f\n"
"9:" // Oddments: Load input (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 11f\n"
"10:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (4, 4): Bit 2: End
- "ldr x25, [x13, #0x30]\n"
+ "ldr x20, [x14, #0x30]\n"
"fmla v31.8h, v8.8h, v12.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 15f\n"
"12:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 15f\n"
"13:" // Oddments: Load input (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 15f\n"
"14:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x24, [x13, #0x38]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: Load input (0, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x10, [x13, #0x40]\n"
+ "ldr x20, [x14, #0x40]\n"
"fmla v23.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 21f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v12.s }[2], [x10], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.h }[6], [x10], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 23f\n"
"20:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.h }[4], [x10], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 23f\n"
"21:" // Oddments: Load input (0, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v12.s }[0], [x10], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.h }[2], [x10], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 23f\n"
"22:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x10], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 3): Bit 2: End
- "ldr x9, [x13, #0x48]\n"
+ "ldr x20, [x14, #0x48]\n"
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "add x9, x9, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 25f\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.h }[6], [x9], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 27f\n"
"24:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.h }[4], [x9], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 27f\n"
"25:" // Oddments: Load input (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 27f\n"
"26:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x28, [x13, #0x50]\n"
+ "ldr x20, [x14, #0x50]\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
- "add x28, x28, x12\n"
+ "add x20, x20, x13\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"tbz %x[n_channels], #2, 29f\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[6], [x28], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 31f\n"
"28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[4], [x28], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 31f\n"
"29:" // Oddments: Load input (1, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 31f\n"
"30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x28], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x27, [x13, #0x58]\n"
+ "ldr x20, [x14, #0x58]\n"
"fmla v23.8h, v3.8h, v11.8h\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "add x27, x27, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 33f\n"
- "ld1 { v13.d }[0], [x27], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v13.s }[2], [x27], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v13.h }[6], [x27], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 35f\n"
"32:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v13.h }[4], [x27], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 35f\n"
"33:" // Oddments: Load input (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v13.s }[0], [x27], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v13.h }[2], [x27], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 35f\n"
"34:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x27], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x26, [x13, #0x60]\n"
+ "ldr x20, [x14, #0x60]\n"
"fmla v25.8h, v5.8h, v13.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x26, x26, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 37f\n"
- "ld1 { v12.d }[0], [x26], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v12.h }[6], [x26], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 39f\n"
"36:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v12.h }[4], [x26], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 39f\n"
"37:" // Oddments: Load input (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v12.s }[0], [x26], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v12.h }[2], [x26], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 39f\n"
"38:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x26], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x23, [x13, #0x68]\n"
+ "ldr x20, [x14, #0x68]\n"
"fmla v26.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "add x23, x23, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 41f\n"
- "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 43f\n"
"40:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 43f\n"
"41:" // Oddments: Load input (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 43f\n"
"42:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x25, [x13, #0x70]\n"
+ "ldr x20, [x14, #0x70]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
"tbz %x[n_channels], #2, 45f\n"
- "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 47f\n"
"44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 47f\n"
"45:" // Oddments: Load input (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 46f\n"
- "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 47f\n"
"46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x24, [x13, #0x78]\n"
+ "ldr x20, [x14, #0x78]\n"
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 49f\n"
- "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 51f\n"
"48:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 51f\n"
"49:" // Oddments: Load input (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 51f\n"
"50:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x10, [x13, #0x80]\n"
+ "ldr x20, [x14, #0x80]\n"
"fmla v29.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 53f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
- "ld1 { v12.s }[2], [x10], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.h }[6], [x10], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 55f\n"
"52:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.h }[4], [x10], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 55f\n"
"53:" // Oddments: Load input (1, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 54f\n"
- "ld1 { v12.s }[0], [x10], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.h }[2], [x10], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 55f\n"
"54:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x10], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (1, 1): Bit 2: End
- "ldr x9, [x13, #0x88]\n"
+ "ldr x20, [x14, #0x88]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
- "add x9, x9, x12\n"
+ "add x20, x20, x13\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 57f\n"
- "ld1 { v11.d }[0], [x9], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
- "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.h }[6], [x9], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 59f\n"
"56:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.h }[4], [x9], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 59f\n"
"57:" // Oddments: Load input (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 58f\n"
- "ld1 { v11.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.h }[2], [x9], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 59f\n"
"58:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x9], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x28, [x13, #0x90]\n"
+ "ldr x20, [x14, #0x90]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
"fmla v25.8h, v4.8h, v11.8h\n"
- "add x28, x28, x12\n"
+ "add x20, x20, x13\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 61f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
- "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v13.h }[6], [x28], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 63f\n"
"60:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v13.h }[4], [x28], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 63f\n"
"61:" // Oddments: Load input (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 62f\n"
- "ld1 { v13.s }[0], [x28], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v13.h }[2], [x28], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 63f\n"
"62:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x28], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x27, [x13, #0x98]\n"
+ "ldr x20, [x14, #0x98]\n"
"fmla v30.8h, v8.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "add x27, x27, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 65f\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v12.h }[6], [x27], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 67f\n"
"64:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v12.h }[4], [x27], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 67f\n"
"65:" // Oddments: Load input (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 66f\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 67f\n"
"66:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x27], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x26, [x13, #0xa0]\n"
+ "ldr x20, [x14, #0xa0]\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "add x26, x26, x12\n"
+ "add x20, x20, x13\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 69f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v11.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 71f\n"
"68:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v11.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 71f\n"
"69:" // Oddments: Load input (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 70f\n"
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v11.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 71f\n"
"70:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x23, [x13, #0xa8]\n"
+ "ldr x20, [x14, #0xa8]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "add x23, x23, x12\n"
+ "add x20, x20, x13\n"
"fmla v25.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 73f\n"
- "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
- "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v13.h }[6], [x23], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 75f\n"
"72:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v13.h }[4], [x23], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 75f\n"
"73:" // Oddments: Load input (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 74f\n"
- "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v13.h }[2], [x23], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 75f\n"
"74:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x23], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x25, [x13, #0xb0]\n"
+ "ldr x20, [x14, #0xb0]\n"
"fmla v27.8h, v8.8h, v13.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 77f\n"
- "ld1 { v12.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
- "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v12.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 79f\n"
"76:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v12.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 79f\n"
"77:" // Oddments: Load input (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 78f\n"
- "ld1 { v12.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v12.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 79f\n"
"78:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x24, [x13, #0xb8]\n"
+ "ldr x20, [x14, #0xb8]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"fmla v29.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 81f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 80f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v11.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 83f\n"
"80:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v11.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 83f\n"
"81:" // Oddments: Load input (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 82f\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 83f\n"
"82:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x10, [x13, #0xc0]\n"
+ "ldr x20, [x14, #0xc0]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"tbz %x[n_channels], #2, 85f\n"
- "ld1 { v13.d }[0], [x10], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 84f\n"
- "ld1 { v13.s }[2], [x10], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v13.h }[6], [x10], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 87f\n"
"84:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v13.h }[4], [x10], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 87f\n"
"85:" // Oddments: Load input (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 86f\n"
- "ld1 { v13.s }[0], [x10], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v13.h }[2], [x10], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 87f\n"
"86:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x10], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"87:" // Oddments: Load input (4, 2): Bit 2: End
"fmla v29.8h, v8.8h, v13.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
@@ -1074,216 +1074,216 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"tbz %x[n_channels], #2, 89f\n"
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.d }[0], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.d }[0], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "st1 { v25.d }[0], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "st1 { v26.d }[0], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v27.d }[0], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "add x11, x11, #0x8\n"
- "st1 { v28.d }[0], [x21]\n"
- "st1 { v29.d }[0], [x20]\n"
- "st1 { v30.d }[0], [x19]\n"
- "st1 { v31.d }[0], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.d }[0], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.d }[0], [x22]\n"
+ "st1 { v25.d }[0], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "add x12, x12, #0x8\n"
+ "st1 { v28.d }[0], [x22]\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v30.d }[0], [x20]\n"
+ "st1 { v31.d }[0], [x23]\n"
"tbz %x[n_channels], #1, 88f\n"
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.s }[2], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "add x21, x21, x11\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.s }[2], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v25.s }[2], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v26.s }[2], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v27.s }[2], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "add x11, x11, #0x4\n"
- "st1 { v28.s }[2], [x21]\n"
- "st1 { v29.s }[2], [x20]\n"
- "st1 { v30.s }[2], [x19]\n"
- "st1 { v31.s }[2], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v31.s }[2], [x23]\n"
"tbz %x[n_channels], #0, 91f\n"
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.h }[6], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "add x21, x21, x11\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.h }[6], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v25.h }[6], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v26.h }[6], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v27.h }[6], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v28.h }[6], [x21]\n"
- "st1 { v29.h }[6], [x20]\n"
- "st1 { v30.h }[6], [x19]\n"
- "st1 { v31.h }[6], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v25.h }[6], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[6], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.h }[6], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v28.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v30.h }[6], [x20]\n"
+ "st1 { v31.h }[6], [x23]\n"
"b 91f\n"
"88:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 91f\n"
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.h }[4], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.h }[4], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v25.h }[4], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v26.h }[4], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v27.h }[4], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v28.h }[4], [x21]\n"
- "st1 { v29.h }[4], [x20]\n"
- "st1 { v30.h }[4], [x19]\n"
- "st1 { v31.h }[4], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v25.h }[4], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[4], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.h }[4], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v28.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v30.h }[4], [x20]\n"
+ "st1 { v31.h }[4], [x23]\n"
"b 91f\n"
"89:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 90f\n"
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.s }[0], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.s }[0], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v25.s }[0], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v26.s }[0], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v27.s }[0], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "add x11, x11, #0x4\n"
- "st1 { v28.s }[0], [x21]\n"
- "st1 { v29.s }[0], [x20]\n"
- "st1 { v30.s }[0], [x19]\n"
- "st1 { v31.s }[0], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.s }[0], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.s }[0], [x22]\n"
+ "st1 { v25.s }[0], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v28.s }[0], [x22]\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v30.s }[0], [x20]\n"
+ "st1 { v31.s }[0], [x23]\n"
"tbz %x[n_channels], #0, 91f\n"
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.h }[2], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "add x21, x21, x11\n"
- "ldr x19, [x15, #0x18]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.h }[2], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v25.h }[2], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v26.h }[2], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v27.h }[2], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v28.h }[2], [x21]\n"
- "st1 { v29.h }[2], [x20]\n"
- "st1 { v30.h }[2], [x19]\n"
- "st1 { v31.h }[2], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v25.h }[2], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[2], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.h }[2], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v28.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v31.h }[2], [x23]\n"
"b 91f\n"
"90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x22, [x15, #0x0]\n"
- "add x22, x22, x11\n"
- "st1 { v23.h }[0], [x22]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "ldr x22, [x15, #0x20]\n"
- "add x19, x19, x11\n"
- "add x22, x22, x11\n"
- "st1 { v24.h }[0], [x21]\n"
- "st1 { v25.h }[0], [x20]\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x21, x21, x11\n"
- "st1 { v26.h }[0], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v27.h }[0], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v28.h }[0], [x21]\n"
- "st1 { v29.h }[0], [x20]\n"
- "st1 { v30.h }[0], [x19]\n"
- "st1 { v31.h }[0], [x22]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.h }[0], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x22, x22, x12\n"
+ "add x21, x21, x12\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x20, x20, x12\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.h }[0], [x22]\n"
+ "st1 { v25.h }[0], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v27.h }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v28.h }[0], [x22]\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v31.h }[0], [x23]\n"
"91:" // Oddments: Store: Bit 2: End
"92:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index b5bee7ae7c..a3a372be05 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,187 +87,186 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x27, #0x0\n"
"mov x26, #0x0\n"
- "mov x25, #0x0\n"
"1:" // Tile loop
- "str x26, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x24, #0x4\n"
- "mov x22, #0x4\n"
- "str x25, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x26, x23\n" // offset = tile_i * ld_input_row
- "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x26, x21\n" // offset = tile_i * ld_output_row
- "mov x7, #0x10\n" // cntb _, ALL, #1
- "madd x20, x25, x5, x20\n" // offset += tile_j * ld_input_col
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
"lsl x5, x5, #0x1\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x19, x25, x6, x19\n" // offset += tile_j * ld_output_col
- "lsl x6, x6, #0x1\n"
- "add x16, x5, x5\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x20, x20, x24\n" // offset *= kernel_stride * output_size
- "add x8, x8, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x14, x8, x23, LSL #1\n"
- "mul x19, x19, x22\n" // offset *= output_tile_size
- "add x13, x14, x23, LSL #1\n"
- "add x17, x17, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "lsr x12, %x[n_channels], #0x3\n"
- "add x11, x13, x23, LSL #1\n"
- "add x10, x16, x5\n"
- "add x9, x17, x21, LSL #1\n"
- "add x28, x11, x23, LSL #1\n"
- "add x27, x10, x5\n"
- "add x26, x9, x21, LSL #1\n"
- "add x22, x6, x6\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x15, x7, x24, LSL #1\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #1\n"
+ "add x8, x8, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x13, %x[n_channels], #0x3\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #1\n"
+ "add x9, x12, x24, LSL #1\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #1\n"
+ "add x23, x5, x5\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v15.8h }, [x20]\n"
- "ld1r { v14.8h }, [x19]\n"
- "add x25, x28, x23, LSL #1\n"
- "add x24, x27, x5\n"
- "add x23, x26, x21, LSL #1\n"
- "add x21, x22, x6\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x7\n"
- "cbz x12, 4f\n"
- "ldr q13, [x15, #0x0]\n"
- "cmp x7, x12, LSL #4\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr q9, [x13, x16]\n"
- "add x15, x15, #0xa0\n"
- "ld1 { v10.8h }, [x8]\n"
- "ldr q11, [x8, x24]\n"
- "ldr q12, [x13, x10]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x26, x9, x24, LSL #1\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #1\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q13, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
"mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "add x7, x7, #0x10\n"
- "cmp x7, x12, LSL #4\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
"mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
"mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "add x19, x19, #0x10\n"
"add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
"mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
"mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
"mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x11, x16]\n"
+ "ldr q9, [x12, x17]\n"
"fmla v16.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x26]\n"
"mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ld1 { v10.8h }, [x25]\n"
- "ldr q11, [x25, x24]\n"
+ "ldr q11, [x26, x25]\n"
"fmla v22.8h, v4.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x11, x10]\n"
+ "ldr q10, [x12, x11]\n"
"fmla v21.8h, v7.8h, v9.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
"mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x8, x5]\n"
+ "ldr q12, [x7, x4]\n"
"mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x7, x28]\n"
"fmla v22.8h, v6.8h, v9.8h\n"
- "ldr q11, [x8, x27]\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ldr q13, [x15, #0x0]\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v21.8h, v8.8h, v10.8h\n"
- "ld1 { v9.8h }, [x14]\n"
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x14, x24]\n"
+ "ldr q12, [x15, x25]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x28]\n"
+ "ld1 { v11.8h }, [x9]\n"
"fmla v22.8h, v7.8h, v10.8h\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
"fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
"fmla v31.8h, v0.8h, v10.8h\n"
- "ldr q10, [x14, x16]\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "ldr q11, [x28, x24]\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
+ "ldr q11, [x9, x25]\n"
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x15, x11]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "ldr q11, [x26, x4]\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "ldr q12, [x14, x10]\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x25, x5]\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v16.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x14, x4]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
- "ldr q10, [x13, x5]\n"
"fmla v18.8h, v4.8h, v12.8h\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x13, x27]\n"
+ "ldr q12, [x14, x28]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x25, x27]\n"
+ "ldr q11, [x26, x28]\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "ldr q10, [x8, x16]\n"
+ "ldr q10, [x7, x17]\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x11, x5]\n"
+ "ldr q11, [x12, x4]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
"fmla v27.8h, v1.8h, v12.8h\n"
- "ldr q12, [x8, x10]\n"
- "add x8, x8, #0x10\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x27]\n"
+ "ldr q11, [x12, x28]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
"fmla v18.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x13]\n"
+ "ld1 { v10.8h }, [x14]\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
@@ -277,25 +276,24 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x28, x16]\n"
+ "ldr q11, [x9, x17]\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "ldr q12, [x13, x24]\n"
- "add x13, x13, #0x10\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
"fmla v16.8h, v6.8h, v10.8h\n"
- "ld1 { v10.8h }, [x11]\n"
+ "ld1 { v10.8h }, [x12]\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q9, [x13, x16]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
"fmla v27.8h, v2.8h, v12.8h\n"
- "ldr q12, [x11, x24]\n"
- "add x11, x11, #0x10\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x25, x16]\n"
+ "ldr q10, [x26, x17]\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
"fmla v30.8h, v6.8h, v10.8h\n"
@@ -303,108 +301,110 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "ldr q11, [x28, x10]\n"
+ "ldr q11, [x9, x11]\n"
"fmla v27.8h, v5.8h, v12.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
"fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x25, x10]\n"
+ "ldr q12, [x26, x11]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x14, x5]\n"
+ "ldr q10, [x15, x4]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v27.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
"fmla v29.8h, v8.8h, v12.8h\n"
- "ldr q11, [x14, x27]\n"
- "add x14, x14, #0x10\n"
+ "add x15, x15, #0x10\n"
"fmla v30.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x28, x5]\n"
+ "ldr q12, [x9, x4]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
"fmax v16.8h, v16.8h, v15.8h\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x27]\n"
- "fmax v17.8h, v17.8h, v15.8h\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q9, [x14, x17]\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "fmax v18.8h, v18.8h, v15.8h\n"
- "add x28, x28, #0x10\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "add x9, x9, #0x10\n"
"fmla v22.8h, v2.8h, v11.8h\n"
+ "ldr q13, [x16, #0x0]\n"
"fmla v23.8h, v1.8h, v11.8h\n"
- "fmax v19.8h, v19.8h, v15.8h\n"
- "ldr q11, [x8, x24]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q0, [x16, #0x10]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
- "ldr q0, [x15, #0x10]\n"
+ "ldr q1, [x16, #0x20]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "ldr q12, [x13, x10]\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q2, [x16, #0x30]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr q3, [x16, #0x40]\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "ldr q1, [x15, #0x20]\n"
+ "ldr q6, [x16, #0x70]\n"
"fmla v30.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x16, #0x60]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
"fmax v23.8h, v23.8h, v15.8h\n"
- "ld1 { v10.8h }, [x8]\n"
"fmax v24.8h, v24.8h, v15.8h\n"
"fmax v25.8h, v25.8h, v15.8h\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
"fmax v26.8h, v26.8h, v15.8h\n"
"fmax v27.8h, v27.8h, v15.8h\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
"fmax v28.8h, v28.8h, v15.8h\n"
"fmax v29.8h, v29.8h, v15.8h\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
"fmax v30.8h, v30.8h, v15.8h\n"
"fmax v31.8h, v31.8h, v15.8h\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
"fmin v16.8h, v16.8h, v14.8h\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "st1 { v16.8h }, [x17]\n"
+ "st1 { v16.8h }, [x8]\n"
+ "ldr q7, [x16, #0x80]\n"
"fmin v18.8h, v18.8h, v14.8h\n"
"fmin v19.8h, v19.8h, v14.8h\n"
- "str q17, [x17, x6]\n"
+ "str q17, [x8, x5]\n"
+ "ldr q8, [x16, #0x90]\n"
"fmin v20.8h, v20.8h, v14.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q18, [x17, x22]\n"
+ "str q18, [x8, x23]\n"
+ "add x16, x16, #0xa0\n"
"fmin v22.8h, v22.8h, v14.8h\n"
"fmin v23.8h, v23.8h, v14.8h\n"
- "str q19, [x17, x21]\n"
- "add x17, x17, #0x10\n"
+ "str q19, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
"fmin v24.8h, v24.8h, v14.8h\n"
"fmin v25.8h, v25.8h, v14.8h\n"
- "st1 { v20.8h }, [x9]\n"
+ "st1 { v20.8h }, [x10]\n"
"fmin v26.8h, v26.8h, v14.8h\n"
"fmin v27.8h, v27.8h, v14.8h\n"
- "str q21, [x9, x6]\n"
+ "str q21, [x10, x5]\n"
"fmin v28.8h, v28.8h, v14.8h\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "str q22, [x9, x22]\n"
+ "str q22, [x10, x23]\n"
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
- "str q23, [x9, x21]\n"
- "add x9, x9, #0x10\n"
- "st1 { v24.8h }, [x26]\n"
- "str q25, [x26, x6]\n"
- "str q26, [x26, x22]\n"
- "str q27, [x26, x21]\n"
- "add x26, x26, #0x10\n"
- "st1 { v28.8h }, [x23]\n"
- "str q29, [x23, x6]\n"
- "str q30, [x23, x22]\n"
- "str q31, [x23, x21]\n"
- "add x23, x23, #0x10\n"
+ "str q23, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v24.8h }, [x27]\n"
+ "str q25, [x27, x5]\n"
+ "str q26, [x27, x23]\n"
+ "str q27, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v28.8h }, [x24]\n"
+ "str q29, [x24, x5]\n"
+ "str q30, [x24, x23]\n"
+ "str q31, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
@@ -417,107 +417,107 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
"mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x11, x16]\n"
+ "ldr q9, [x12, x17]\n"
"fmla v16.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x26]\n"
"mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ld1 { v10.8h }, [x25]\n"
- "ldr q11, [x25, x24]\n"
+ "ldr q11, [x26, x25]\n"
"fmla v22.8h, v4.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x11, x10]\n"
+ "ldr q10, [x12, x11]\n"
"fmla v21.8h, v7.8h, v9.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
"mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x8, x5]\n"
+ "ldr q12, [x7, x4]\n"
"mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x7, x28]\n"
"fmla v22.8h, v6.8h, v9.8h\n"
- "ldr q11, [x8, x27]\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v21.8h, v8.8h, v10.8h\n"
- "ld1 { v9.8h }, [x14]\n"
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "ldr q12, [x14, x24]\n"
+ "ldr q12, [x15, x25]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "ld1 { v11.8h }, [x28]\n"
+ "ld1 { v11.8h }, [x9]\n"
"fmla v22.8h, v7.8h, v10.8h\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
"fmla v27.8h, v3.8h, v10.8h\n"
- "fmla v29.8h, v2.8h, v10.8h\n"
- "fmla v30.8h, v1.8h, v10.8h\n"
"fmla v31.8h, v0.8h, v10.8h\n"
- "ldr q10, [x14, x16]\n"
- "fmla v20.8h, v0.8h, v9.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "fmla v21.8h, v1.8h, v10.8h\n"
- "ldr q11, [x28, x24]\n"
- "fmla v16.8h, v3.8h, v9.8h\n"
+ "ldr q11, [x9, x25]\n"
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x15, x11]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "ldr q11, [x26, x4]\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "ldr q12, [x14, x10]\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x25, x5]\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v16.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x14, x4]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
- "ldr q10, [x13, x5]\n"
"fmla v18.8h, v4.8h, v12.8h\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x13, x27]\n"
+ "ldr q12, [x14, x28]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x25, x27]\n"
+ "ldr q11, [x26, x28]\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "ldr q10, [x8, x16]\n"
+ "ldr q10, [x7, x17]\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x11, x5]\n"
+ "ldr q11, [x12, x4]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
"fmla v27.8h, v1.8h, v12.8h\n"
- "ldr q12, [x8, x10]\n"
- "add x8, x8, #0x10\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x11, x27]\n"
+ "ldr q11, [x12, x28]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
"fmla v18.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x13]\n"
+ "ld1 { v10.8h }, [x14]\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
@@ -527,24 +527,24 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x28, x16]\n"
+ "ldr q11, [x9, x17]\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "ldr q12, [x13, x24]\n"
- "add x13, x13, #0x10\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
"fmla v16.8h, v6.8h, v10.8h\n"
- "ld1 { v10.8h }, [x11]\n"
+ "ld1 { v10.8h }, [x12]\n"
"fmla v29.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
"fmla v27.8h, v2.8h, v12.8h\n"
- "ldr q12, [x11, x24]\n"
- "add x11, x11, #0x10\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q10, [x25, x16]\n"
+ "ldr q10, [x26, x17]\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
"fmla v30.8h, v6.8h, v10.8h\n"
@@ -552,36 +552,36 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "ldr q11, [x28, x10]\n"
+ "ldr q11, [x9, x11]\n"
"fmla v27.8h, v5.8h, v12.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
"fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x25, x10]\n"
+ "ldr q12, [x26, x11]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x14, x5]\n"
+ "ldr q10, [x15, x4]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla v27.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
"fmla v29.8h, v8.8h, v12.8h\n"
- "ldr q11, [x14, x27]\n"
- "add x14, x14, #0x10\n"
+ "add x15, x15, #0x10\n"
"fmla v30.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x28, x5]\n"
+ "ldr q12, [x9, x4]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
"fmax v16.8h, v16.8h, v15.8h\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x28, x27]\n"
+ "ldr q10, [x9, x28]\n"
"fmax v17.8h, v17.8h, v15.8h\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
"fmax v18.8h, v18.8h, v15.8h\n"
- "add x28, x28, #0x10\n"
+ "add x9, x9, #0x10\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"fmax v19.8h, v19.8h, v15.8h\n"
@@ -607,101 +607,101 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmax v31.8h, v31.8h, v15.8h\n"
"fmin v16.8h, v16.8h, v14.8h\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "st1 { v16.8h }, [x17]\n"
+ "st1 { v16.8h }, [x8]\n"
"fmin v18.8h, v18.8h, v14.8h\n"
"fmin v19.8h, v19.8h, v14.8h\n"
- "str q17, [x17, x6]\n"
+ "str q17, [x8, x5]\n"
"fmin v20.8h, v20.8h, v14.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q18, [x17, x22]\n"
+ "str q18, [x8, x23]\n"
"fmin v22.8h, v22.8h, v14.8h\n"
"fmin v23.8h, v23.8h, v14.8h\n"
- "str q19, [x17, x21]\n"
- "add x17, x17, #0x10\n"
+ "str q19, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
"fmin v24.8h, v24.8h, v14.8h\n"
"fmin v25.8h, v25.8h, v14.8h\n"
- "st1 { v20.8h }, [x9]\n"
+ "st1 { v20.8h }, [x10]\n"
"fmin v26.8h, v26.8h, v14.8h\n"
"fmin v27.8h, v27.8h, v14.8h\n"
- "str q21, [x9, x6]\n"
+ "str q21, [x10, x5]\n"
"fmin v28.8h, v28.8h, v14.8h\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "str q22, [x9, x22]\n"
+ "str q22, [x10, x23]\n"
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
- "str q23, [x9, x21]\n"
- "add x9, x9, #0x10\n"
- "st1 { v24.8h }, [x26]\n"
- "str q25, [x26, x6]\n"
- "str q26, [x26, x22]\n"
- "str q27, [x26, x21]\n"
- "add x26, x26, #0x10\n"
- "st1 { v28.8h }, [x23]\n"
- "str q29, [x23, x6]\n"
- "str q30, [x23, x22]\n"
- "str q31, [x23, x21]\n"
- "add x23, x23, #0x10\n"
+ "str q23, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v24.8h }, [x27]\n"
+ "str q25, [x27, x5]\n"
+ "str q26, [x27, x23]\n"
+ "str q27, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v28.8h }, [x24]\n"
+ "str q29, [x24, x5]\n"
+ "str q30, [x24, x23]\n"
+ "str q31, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 141f\n"
- "ldr q13, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "add x22, x13, x16\n"
- "add x21, x8, XZR\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "add x20, x8, x24\n"
- "add x19, x13, x10\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
+ "ldr q13, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
"tbz %x[n_channels], #2, 6f\n"
- "ldr d9, [x22], #0x8\n"
- "ldr d10, [x21], #0x8\n"
- "ldr d11, [x20], #0x8\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v9.s }[2], [x22], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
- "ld1 { v11.s }[2], [x20], #0x4\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[6], [x22]\n"
- "ld1 { v10.h }[6], [x21]\n"
- "ld1 { v11.h }[6], [x20]\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x23]\n"
+ "ld1 { v10.h }[6], [x22]\n"
+ "ld1 { v11.h }[6], [x21]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 8f\n"
"5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[4], [x22]\n"
- "ld1 { v10.h }[4], [x21]\n"
- "ld1 { v11.h }[4], [x20]\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x23]\n"
+ "ld1 { v10.h }[4], [x22]\n"
+ "ld1 { v11.h }[4], [x21]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 8f\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 7f\n"
- "ldr s9, [x22], #0x4\n"
- "ldr s10, [x21], #0x4\n"
- "ldr s11, [x20], #0x4\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[2], [x22]\n"
- "ld1 { v10.h }[2], [x21]\n"
- "ld1 { v11.h }[2], [x20]\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x22, #0x0]\n"
- "ldr h10, [x21, #0x0]\n"
- "ldr h11, [x20, #0x0]\n"
- "ldr h12, [x19, #0x0]\n"
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
"mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
"mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "add x19, x25, XZR\n"
+ "add x20, x26, XZR\n"
"mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
"mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
@@ -721,72 +721,72 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v26.8h, v1.8h, v12.8h\n"
"mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 10f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 12f\n"
"9:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 12f\n"
"10:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 11f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
"mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "add x19, x25, x24\n"
+ "add x20, x26, x25\n"
"tbz %x[n_channels], #2, 14f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 16f\n"
"13:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 16f\n"
"14:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 15f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
"mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "add x19, x11, x16\n"
+ "add x20, x12, x17\n"
"tbz %x[n_channels], #2, 18f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 20f\n"
"17:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 20f\n"
"18:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 19f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v21.8h, v7.8h, v9.8h\n"
- "add x19, x8, x5\n"
+ "add x20, x7, x4\n"
"fmla v22.8h, v6.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v25.8h, v4.8h, v9.8h\n"
@@ -795,74 +795,74 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
"mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 22f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 24f\n"
"21:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 24f\n"
"22:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 23f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "add x19, x8, x27\n"
+ "add x20, x7, x28\n"
"tbz %x[n_channels], #2, 26f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 28f\n"
"25:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 28f\n"
"26:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 27f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: End
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "add x19, x11, x10\n"
+ "add x20, x12, x11\n"
"tbz %x[n_channels], #2, 30f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 32f\n"
"29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 32f\n"
"30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 31f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v22.8h, v7.8h, v10.8h\n"
- "add x19, x14, XZR\n"
+ "add x20, x15, XZR\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
@@ -871,645 +871,645 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v30.8h, v1.8h, v10.8h\n"
"fmla v31.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 34f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 36f\n"
"33:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 36f\n"
"34:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 35f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "add x19, x14, x24\n"
+ "add x20, x15, x25\n"
"tbz %x[n_channels], #2, 38f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 40f\n"
"37:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 40f\n"
"38:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 39f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: End
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "add x19, x28, XZR\n"
+ "add x20, x9, XZR\n"
"tbz %x[n_channels], #2, 42f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 44f\n"
"41:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 44f\n"
"42:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 43f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x19, x14, x16\n"
+ "add x20, x15, x17\n"
"tbz %x[n_channels], #2, 46f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 48f\n"
"45:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 48f\n"
"46:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 47f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "add x19, x28, x24\n"
+ "add x20, x9, x25\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 50f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 52f\n"
"49:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 52f\n"
"50:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 51f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
"fmla v27.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x19, x14, x10\n"
+ "add x20, x15, x11\n"
"tbz %x[n_channels], #2, 54f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 56f\n"
"53:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 56f\n"
"54:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 55f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
- "add x19, x25, x5\n"
+ "add x20, x26, x4\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 58f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 57f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 60f\n"
"57:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 60f\n"
"58:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 59f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 60f\n"
"59:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "add x19, x13, x5\n"
+ "add x20, x14, x4\n"
"tbz %x[n_channels], #2, 62f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 64f\n"
"61:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 64f\n"
"62:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 63f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 64f\n"
"63:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"64:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "add x19, x25, x27\n"
+ "add x20, x26, x28\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 66f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 68f\n"
"65:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 68f\n"
"66:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 67f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 68f\n"
"67:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "add x19, x13, x27\n"
+ "add x20, x14, x28\n"
"tbz %x[n_channels], #2, 70f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 72f\n"
"69:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 72f\n"
"70:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 71f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 72f\n"
"71:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"72:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "add x19, x8, x16\n"
+ "add x20, x7, x17\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
"fmla v27.8h, v1.8h, v12.8h\n"
"tbz %x[n_channels], #2, 74f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 73f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 76f\n"
"73:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 76f\n"
"74:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 75f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 76f\n"
"75:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
- "add x19, x11, x5\n"
+ "add x20, x12, x4\n"
"fmla v18.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 78f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 77f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 80f\n"
"77:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 80f\n"
"78:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 79f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 80f\n"
"79:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"80:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
- "add x19, x8, x10\n"
+ "add x20, x7, x11\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 82f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 81f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 84f\n"
"81:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 84f\n"
"82:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 83f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 84f\n"
"83:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"84:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "add x19, x13, XZR\n"
+ "add x20, x14, XZR\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 86f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 85f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 88f\n"
"85:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 88f\n"
"86:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 87f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 88f\n"
"87:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"88:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
- "add x19, x11, x27\n"
+ "add x20, x12, x28\n"
"fmla v24.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 90f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 89f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 92f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 92f\n"
"89:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 92f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 92f\n"
"90:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 91f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 92f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 92f\n"
"91:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"92:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
"fmla v22.8h, v8.8h, v11.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x19, x13, x24\n"
+ "add x20, x14, x25\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 94f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 93f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 96f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 96f\n"
"93:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 96f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 96f\n"
"94:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 95f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 96f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 96f\n"
"95:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"96:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
- "add x19, x11, XZR\n"
+ "add x20, x12, XZR\n"
"fmla v27.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 98f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 97f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 100f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 100f\n"
"97:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 100f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 100f\n"
"98:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 99f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 100f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 100f\n"
"99:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"100:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "add x19, x28, x16\n"
+ "add x20, x9, x17\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 102f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 101f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 104f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 104f\n"
"101:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 104f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 104f\n"
"102:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 103f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 104f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 104f\n"
"103:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"104:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v24.8h, v8.8h, v11.8h\n"
"fmla v25.8h, v7.8h, v11.8h\n"
- "add x19, x11, x24\n"
+ "add x20, x12, x25\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"tbz %x[n_channels], #2, 106f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 105f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 108f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 108f\n"
"105:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 108f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 108f\n"
"106:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 107f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 108f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 108f\n"
"107:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"108:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
"fmla v23.8h, v8.8h, v12.8h\n"
"fmla v27.8h, v5.8h, v12.8h\n"
- "add x19, x25, x16\n"
+ "add x20, x26, x17\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 110f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 109f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 112f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 112f\n"
"109:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 112f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 112f\n"
"110:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 111f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 112f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 112f\n"
"111:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"112:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x19, x28, x10\n"
+ "add x20, x9, x11\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"tbz %x[n_channels], #2, 114f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 113f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 116f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 116f\n"
"113:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 116f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 116f\n"
"114:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 115f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 116f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 116f\n"
"115:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"116:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x19, x25, x10\n"
+ "add x20, x26, x11\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
"tbz %x[n_channels], #2, 118f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 117f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 120f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 120f\n"
"117:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 120f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 120f\n"
"118:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 119f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 120f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 120f\n"
"119:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"120:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
"fmla v29.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
- "add x19, x14, x5\n"
+ "add x20, x15, x4\n"
"fmla v31.8h, v6.8h, v12.8h\n"
"tbz %x[n_channels], #2, 122f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 121f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 124f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 124f\n"
"121:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 124f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 124f\n"
"122:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 123f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 124f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 124f\n"
"123:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"124:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
- "add x19, x14, x27\n"
+ "add x20, x15, x28\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 126f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 125f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 128f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 128f\n"
"125:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 128f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 128f\n"
"126:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 127f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 128f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 128f\n"
"127:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"128:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "add x19, x28, x5\n"
+ "add x20, x9, x4\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 130f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 129f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 132f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 132f\n"
"129:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 132f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 132f\n"
"130:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 131f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 132f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 132f\n"
"131:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"132:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "add x19, x28, x27\n"
+ "add x20, x9, x28\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 134f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 133f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 136f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 136f\n"
"133:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 136f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 136f\n"
"134:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 135f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 136f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 136f\n"
"135:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
@@ -1548,186 +1548,186 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 138f\n"
- "mov x22, x17\n"
- "mov x21, x9\n"
- "mov x20, x26\n"
- "st1 { v16.d }[0], [x22], x6\n"
- "mov x19, x23\n"
- "st1 { v20.d }[0], [x21], x6\n"
- "add x17, x17, #0x8\n"
- "st1 { v24.d }[0], [x20], x6\n"
- "add x9, x9, #0x8\n"
- "add x26, x26, #0x8\n"
- "st1 { v28.d }[0], [x19], x6\n"
- "add x23, x23, #0x8\n"
- "st1 { v17.d }[0], [x22], x6\n"
- "st1 { v21.d }[0], [x21], x6\n"
- "st1 { v25.d }[0], [x20], x6\n"
- "st1 { v29.d }[0], [x19], x6\n"
- "st1 { v18.d }[0], [x22], x6\n"
- "st1 { v22.d }[0], [x21], x6\n"
- "st1 { v26.d }[0], [x20], x6\n"
- "st1 { v30.d }[0], [x19], x6\n"
- "st1 { v19.d }[0], [x22]\n"
- "st1 { v23.d }[0], [x21]\n"
- "st1 { v27.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 137f\n"
- "mov x22, x17\n"
- "mov x21, x9\n"
- "mov x20, x26\n"
- "mov x19, x23\n"
- "st1 { v16.s }[2], [x22], x6\n"
- "st1 { v20.s }[2], [x21], x6\n"
- "add x17, x17, #0x4\n"
- "add x9, x9, #0x4\n"
- "st1 { v24.s }[2], [x20], x6\n"
- "add x26, x26, #0x4\n"
- "add x23, x23, #0x4\n"
- "st1 { v28.s }[2], [x19], x6\n"
- "st1 { v17.s }[2], [x22], x6\n"
- "st1 { v21.s }[2], [x21], x6\n"
- "st1 { v25.s }[2], [x20], x6\n"
- "st1 { v29.s }[2], [x19], x6\n"
- "st1 { v18.s }[2], [x22], x6\n"
- "st1 { v22.s }[2], [x21], x6\n"
- "st1 { v26.s }[2], [x20], x6\n"
- "st1 { v30.s }[2], [x19], x6\n"
- "st1 { v19.s }[2], [x22]\n"
- "st1 { v23.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 140f\n"
- "mov x22, x17\n"
- "mov x21, x9\n"
- "mov x20, x26\n"
- "mov x19, x23\n"
- "st1 { v16.h }[6], [x22], x6\n"
- "st1 { v20.h }[6], [x21], x6\n"
- "st1 { v24.h }[6], [x20], x6\n"
- "st1 { v28.h }[6], [x19], x6\n"
- "st1 { v17.h }[6], [x22], x6\n"
- "st1 { v21.h }[6], [x21], x6\n"
- "st1 { v25.h }[6], [x20], x6\n"
- "st1 { v29.h }[6], [x19], x6\n"
- "st1 { v18.h }[6], [x22], x6\n"
- "st1 { v22.h }[6], [x21], x6\n"
- "st1 { v26.h }[6], [x20], x6\n"
- "st1 { v30.h }[6], [x19], x6\n"
- "st1 { v19.h }[6], [x22]\n"
- "st1 { v23.h }[6], [x21]\n"
- "st1 { v27.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[6], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[6], [x22], x5\n"
+ "st1 { v24.h }[6], [x21], x5\n"
+ "st1 { v28.h }[6], [x20], x5\n"
+ "st1 { v17.h }[6], [x23], x5\n"
+ "st1 { v21.h }[6], [x22], x5\n"
+ "st1 { v25.h }[6], [x21], x5\n"
+ "st1 { v29.h }[6], [x20], x5\n"
+ "st1 { v18.h }[6], [x23], x5\n"
+ "st1 { v22.h }[6], [x22], x5\n"
+ "st1 { v26.h }[6], [x21], x5\n"
+ "st1 { v30.h }[6], [x20], x5\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 140f\n"
"137:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 140f\n"
- "mov x22, x17\n"
- "mov x21, x9\n"
- "st1 { v16.h }[4], [x22], x6\n"
- "mov x20, x26\n"
- "mov x19, x23\n"
- "st1 { v20.h }[4], [x21], x6\n"
- "st1 { v24.h }[4], [x20], x6\n"
- "st1 { v28.h }[4], [x19], x6\n"
- "st1 { v17.h }[4], [x22], x6\n"
- "st1 { v21.h }[4], [x21], x6\n"
- "st1 { v25.h }[4], [x20], x6\n"
- "st1 { v29.h }[4], [x19], x6\n"
- "st1 { v18.h }[4], [x22], x6\n"
- "st1 { v22.h }[4], [x21], x6\n"
- "st1 { v26.h }[4], [x20], x6\n"
- "st1 { v30.h }[4], [x19], x6\n"
- "st1 { v19.h }[4], [x22]\n"
- "st1 { v23.h }[4], [x21]\n"
- "st1 { v27.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[4], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[4], [x22], x5\n"
+ "st1 { v24.h }[4], [x21], x5\n"
+ "st1 { v28.h }[4], [x20], x5\n"
+ "st1 { v17.h }[4], [x23], x5\n"
+ "st1 { v21.h }[4], [x22], x5\n"
+ "st1 { v25.h }[4], [x21], x5\n"
+ "st1 { v29.h }[4], [x20], x5\n"
+ "st1 { v18.h }[4], [x23], x5\n"
+ "st1 { v22.h }[4], [x22], x5\n"
+ "st1 { v26.h }[4], [x21], x5\n"
+ "st1 { v30.h }[4], [x20], x5\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 140f\n"
"138:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 139f\n"
- "mov x22, x17\n"
- "mov x21, x9\n"
- "st1 { v16.s }[0], [x22], x6\n"
- "mov x20, x26\n"
- "mov x19, x23\n"
- "st1 { v20.s }[0], [x21], x6\n"
- "st1 { v24.s }[0], [x20], x6\n"
- "add x17, x17, #0x4\n"
- "add x9, x9, #0x4\n"
- "st1 { v28.s }[0], [x19], x6\n"
- "add x26, x26, #0x4\n"
- "add x23, x23, #0x4\n"
- "st1 { v17.s }[0], [x22], x6\n"
- "st1 { v21.s }[0], [x21], x6\n"
- "st1 { v25.s }[0], [x20], x6\n"
- "st1 { v29.s }[0], [x19], x6\n"
- "st1 { v18.s }[0], [x22], x6\n"
- "st1 { v22.s }[0], [x21], x6\n"
- "st1 { v26.s }[0], [x20], x6\n"
- "st1 { v30.s }[0], [x19], x6\n"
- "st1 { v19.s }[0], [x22]\n"
- "st1 { v23.s }[0], [x21]\n"
- "st1 { v27.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 140f\n"
- "mov x22, x17\n"
- "mov x21, x9\n"
- "mov x20, x26\n"
- "mov x19, x23\n"
- "st1 { v16.h }[2], [x22], x6\n"
- "st1 { v20.h }[2], [x21], x6\n"
- "st1 { v24.h }[2], [x20], x6\n"
- "st1 { v28.h }[2], [x19], x6\n"
- "st1 { v17.h }[2], [x22], x6\n"
- "st1 { v21.h }[2], [x21], x6\n"
- "st1 { v25.h }[2], [x20], x6\n"
- "st1 { v29.h }[2], [x19], x6\n"
- "st1 { v18.h }[2], [x22], x6\n"
- "st1 { v22.h }[2], [x21], x6\n"
- "st1 { v26.h }[2], [x20], x6\n"
- "st1 { v30.h }[2], [x19], x6\n"
- "st1 { v19.h }[2], [x22]\n"
- "st1 { v23.h }[2], [x21]\n"
- "st1 { v27.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[2], [x22], x5\n"
+ "st1 { v24.h }[2], [x21], x5\n"
+ "st1 { v28.h }[2], [x20], x5\n"
+ "st1 { v17.h }[2], [x23], x5\n"
+ "st1 { v21.h }[2], [x22], x5\n"
+ "st1 { v25.h }[2], [x21], x5\n"
+ "st1 { v29.h }[2], [x20], x5\n"
+ "st1 { v18.h }[2], [x23], x5\n"
+ "st1 { v22.h }[2], [x22], x5\n"
+ "st1 { v26.h }[2], [x21], x5\n"
+ "st1 { v30.h }[2], [x20], x5\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 140f\n"
"139:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "mov x22, x17\n"
- "mov x21, x9\n"
- "st1 { v16.h }[0], [x22], x6\n"
- "mov x20, x26\n"
- "mov x19, x23\n"
- "st1 { v20.h }[0], [x21], x6\n"
- "st1 { v24.h }[0], [x20], x6\n"
- "st1 { v28.h }[0], [x19], x6\n"
- "st1 { v17.h }[0], [x22], x6\n"
- "st1 { v21.h }[0], [x21], x6\n"
- "st1 { v25.h }[0], [x20], x6\n"
- "st1 { v29.h }[0], [x19], x6\n"
- "st1 { v18.h }[0], [x22], x6\n"
- "st1 { v22.h }[0], [x21], x6\n"
- "st1 { v26.h }[0], [x20], x6\n"
- "st1 { v30.h }[0], [x19], x6\n"
- "st1 { v19.h }[0], [x22]\n"
- "st1 { v23.h }[0], [x21]\n"
- "st1 { v27.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[0], [x22], x5\n"
+ "st1 { v24.h }[0], [x21], x5\n"
+ "st1 { v28.h }[0], [x20], x5\n"
+ "st1 { v17.h }[0], [x23], x5\n"
+ "st1 { v21.h }[0], [x22], x5\n"
+ "st1 { v25.h }[0], [x21], x5\n"
+ "st1 { v29.h }[0], [x20], x5\n"
+ "st1 { v18.h }[0], [x23], x5\n"
+ "st1 { v22.h }[0], [x22], x5\n"
+ "st1 { v26.h }[0], [x21], x5\n"
+ "st1 { v30.h }[0], [x20], x5\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"140:" // Tile loop: Oddments: Store: Bit 2: End
"141:" // Tile loop: End
- "ldr x25, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x25, x25, #0x1\n"
- "add x20, x26, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x25, x19\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x26, x26, x20, LT\n"
- "csel x25, x25, XZR, LT\n"
- "cmp x26, x19\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index e493104c03..16326150fd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,211 +98,210 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "mov x17, #0x10\n" // cntb _, ALL, #1
- "lsr x16, %x[n_channels], #0x3\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x3\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v15.8h }, [x20]\n"
- "ld1r { v14.8h }, [x19]\n"
- "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x12, #0x0\n"
- "sub x11, XZR, x17\n"
- "cbz x16, 3f\n"
- "ldp x10, x9, [x13, #0x0]\n"
- "ldp x28, x27, [x13, #0x10]\n"
- "cmp x17, x16, LSL #4\n"
- "ldr q13, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
- "ldr q9, [x10, x12]\n"
- "ldr q10, [x9, x12]\n"
- "ldr q11, [x28, x12]\n"
- "ldr q12, [x27, x12]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x13, #0x0\n"
+ "sub x12, XZR, x8\n"
+ "cbz x17, 3f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x8, x17, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "ldr q9, [x11, x13]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr q12, [x28, x13]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
"mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "ldr x26, [x13, #0x20]\n"
- "ldr x25, [x13, #0x30]\n"
+ "ldr x27, [x14, #0x20]\n"
+ "ldr x26, [x14, #0x30]\n"
"mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
"mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "ldr x24, [x13, #0x28]\n"
- "ldr x23, [x13, #0x38]\n"
+ "ldr x25, [x14, #0x28]\n"
+ "ldr x24, [x14, #0x38]\n"
"mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
"mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x10, [x13, #0x40]\n"
- "ldr x9, [x13, #0x48]\n"
+ "ldr x11, [x14, #0x40]\n"
+ "ldr x10, [x14, #0x48]\n"
"mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
- "ldr x28, [x13, #0x50]\n"
- "ldr x27, [x13, #0x58]\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
"mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
"mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x25, x12]\n"
- "ldr x25, [x13, #0x70]\n"
+ "ldr q9, [x26, x13]\n"
+ "ldr x26, [x14, #0x70]\n"
"fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x13]\n"
"mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q10, [x26, x12]\n"
- "ldr q11, [x24, x12]\n"
+ "ldr q11, [x25, x13]\n"
"fmla v22.8h, v4.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
- "ldr x26, [x13, #0x60]\n"
- "ldr x24, [x13, #0x68]\n"
+ "ldr x27, [x14, #0x60]\n"
+ "ldr x25, [x14, #0x68]\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x9, x12]\n"
- "ldr x9, [x13, #0x88]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0x88]\n"
"fmla v21.8h, v7.8h, v9.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x23, x12]\n"
- "ldr x23, [x13, #0x78]\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0x78]\n"
"mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x11, x13]\n"
"fmla v22.8h, v6.8h, v9.8h\n"
- "ldr q11, [x10, x12]\n"
- "ldr x10, [x13, #0x80]\n"
+ "ldr x11, [x14, #0x80]\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
- "add x11, x11, #0x10\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
- "ldr q13, [x14, #0x0]\n"
+ "add x12, x12, #0x10\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v1.8h, v12.8h\n"
- "ldr q9, [x28, x12]\n"
- "ldr x28, [x13, #0x90]\n"
"fmla v17.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x28, x13]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
- "ldr q12, [x27, x12]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x28, [x14, #0x98]\n"
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q11, [x26, x12]\n"
- "ldr x26, [x13, #0xa0]\n"
+ "ldr q11, [x27, x13]\n"
+ "ldr x27, [x14, #0xa0]\n"
"fmla v22.8h, v7.8h, v10.8h\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
"fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x26, x13]\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0xb8]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x9, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
"fmla v29.8h, v2.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "ldr q10, [x24, x12]\n"
- "ldr x24, [x13, #0xa8]\n"
+ "ldr q10, [x25, x13]\n"
+ "ldr x25, [x14, #0xa8]\n"
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x25, x12]\n"
- "ldr x25, [x13, #0xb0]\n"
+ "ldr q11, [x11, x13]\n"
+ "ldr x11, [x14, #0xc0]\n"
"fmla v17.8h, v4.8h, v10.8h\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
- "ldr q12, [x23, x12]\n"
- "ldr x23, [x13, #0xb8]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x10, x12]\n"
- "ldr x10, [x13, #0xc0]\n"
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
- "ldr q10, [x9, x12]\n"
- "ldr x9, [x13, #0xc8]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0xc8]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x27, x12]\n"
- "ldr x27, [x13, #0xd8]\n"
+ "ldr q12, [x28, x13]\n"
+ "ldr x28, [x14, #0xd8]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x28, x12]\n"
- "ldr x28, [x13, #0xd0]\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr x9, [x14, #0xd0]\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
- "ldr q10, [x26, x12]\n"
- "ldr x26, [x13, #0xe0]\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr x27, [x14, #0xe0]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x24, x12]\n"
+ "ldr q11, [x25, x13]\n"
"fmla v27.8h, v1.8h, v12.8h\n"
- "ldr x24, [x13, #0xe8]\n"
+ "ldr x25, [x14, #0xe8]\n"
"fmla v19.8h, v7.8h, v12.8h\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
- "ldr q12, [x25, x12]\n"
- "ldr x25, [x13, #0xf0]\n"
+ "ldr q12, [x26, x13]\n"
+ "ldr x26, [x14, #0xf0]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
"fmla v18.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x24, x13]\n"
"fmla v20.8h, v7.8h, v11.8h\n"
- "ldr q10, [x23, x12]\n"
- "ldr x23, [x13, #0xf8]\n"
+ "ldr x24, [x14, #0xf8]\n"
"fmla v21.8h, v6.8h, v11.8h\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x10, x12]\n"
+ "ldr q11, [x11, x13]\n"
"fmla v27.8h, v4.8h, v11.8h\n"
- "ldr x10, [x13, #0x100]\n"
+ "ldr x11, [x14, #0x100]\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
"fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q12, [x9, x12]\n"
- "ldr x9, [x13, #0x108]\n"
+ "ldr q12, [x10, x13]\n"
+ "ldr x10, [x14, #0x108]\n"
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x13]\n"
"fmla v22.8h, v8.8h, v11.8h\n"
- "ldr q10, [x28, x12]\n"
- "ldr x28, [x13, #0x110]\n"
+ "ldr x9, [x14, #0x110]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x12]\n"
+ "ldr q11, [x28, x13]\n"
"fmla v27.8h, v2.8h, v12.8h\n"
- "ldr x27, [x13, #0x118]\n"
+ "ldr x28, [x14, #0x118]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x27, x13]\n"
"fmla v20.8h, v6.8h, v10.8h\n"
- "ldr q12, [x26, x12]\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "ldr q10, [x24, x12]\n"
+ "ldr q10, [x25, x13]\n"
"fmla v25.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
@@ -311,292 +310,293 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla v29.8h, v7.8h, v10.8h\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x12]\n"
+ "ldr q11, [x26, x13]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x10, x12]\n"
+ "ldr q10, [x11, x13]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q11, [x9, x12]\n"
- "ldp x10, x9, [x13, #0x0]\n"
+ "ldr q11, [x10, x13]\n"
+ "ldp x11, x10, [x14, #0x0]\n"
"fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x23, x12]\n"
+ "ldr q12, [x24, x13]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmax v16.8h, v16.8h, v15.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmax v17.8h, v17.8h, v15.8h\n"
- "ldr q9, [x10, x17]\n"
"fmla v19.8h, v4.8h, v11.8h\n"
"fmla v29.8h, v8.8h, v12.8h\n"
"fmax v18.8h, v18.8h, v15.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x28, x12]\n"
+ "ldr q12, [x9, x13]\n"
"fmax v19.8h, v19.8h, v15.8h\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x12]\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
+ "ldr q10, [x28, x13]\n"
+ "ldr q9, [x11, x8]\n"
"fmla v22.8h, v2.8h, v11.8h\n"
+ "ldr q13, [x15, #0x0]\n"
"fmla v23.8h, v1.8h, v11.8h\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "str q16, [x22, x11]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "ldr q1, [x15, #0x20]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "str q17, [x21, x11]\n"
+ "ldr q2, [x15, #0x30]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr q6, [x15, #0x70]\n"
"fmla v27.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q16, [x23, x12]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
"fmin v19.8h, v19.8h, v14.8h\n"
- "str q18, [x20, x11]\n"
+ "str q17, [x22, x12]\n"
+ "ldr x23, [x16, #0x20]\n"
"fmax v20.8h, v20.8h, v15.8h\n"
"fmax v21.8h, v21.8h, v15.8h\n"
- "str q19, [x19, x11]\n"
- "ldr x22, [x15, #0x20]\n"
+ "str q18, [x21, x12]\n"
+ "ldr x22, [x16, #0x28]\n"
"fmax v22.8h, v22.8h, v15.8h\n"
"fmax v23.8h, v23.8h, v15.8h\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "ldr x19, [x15, #0x38]\n"
+ "str q19, [x20, x12]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
+ "ldr q3, [x15, #0x40]\n"
"fmla v30.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x15, #0x60]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x10, x8]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x22, x11]\n"
+ "str q20, [x23, x12]\n"
"fmin v22.8h, v22.8h, v14.8h\n"
"fmin v23.8h, v23.8h, v14.8h\n"
- "str q21, [x21, x11]\n"
- "ldr x22, [x15, #0x40]\n"
+ "str q21, [x22, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
"fmax v24.8h, v24.8h, v15.8h\n"
"fmax v25.8h, v25.8h, v15.8h\n"
- "str q22, [x20, x11]\n"
- "ldr x21, [x15, #0x48]\n"
+ "str q22, [x21, x12]\n"
+ "ldr x22, [x16, #0x48]\n"
"fmax v26.8h, v26.8h, v15.8h\n"
"fmax v27.8h, v27.8h, v15.8h\n"
- "str q23, [x19, x11]\n"
- "ldr x20, [x15, #0x50]\n"
- "ldr x19, [x15, #0x58]\n"
- "ldp x28, x27, [x13, #0x10]\n"
+ "str q23, [x20, x12]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
"fmin v24.8h, v24.8h, v14.8h\n"
"fmin v25.8h, v25.8h, v14.8h\n"
+ "ldr q11, [x9, x8]\n"
+ "ldr q12, [x28, x8]\n"
"fmin v26.8h, v26.8h, v14.8h\n"
"fmin v27.8h, v27.8h, v14.8h\n"
- "str q24, [x22, x11]\n"
- "ldr x22, [x15, #0x60]\n"
"fmax v28.8h, v28.8h, v15.8h\n"
"fmax v29.8h, v29.8h, v15.8h\n"
- "str q25, [x21, x11]\n"
- "ldr x21, [x15, #0x68]\n"
+ "str q24, [x23, x12]\n"
+ "ldr x23, [x16, #0x60]\n"
"fmax v30.8h, v30.8h, v15.8h\n"
"fmax v31.8h, v31.8h, v15.8h\n"
- "str q26, [x20, x11]\n"
- "ldr x20, [x15, #0x70]\n"
- "str q27, [x19, x11]\n"
- "ldr x19, [x15, #0x78]\n"
- "ldr q10, [x9, x17]\n"
+ "str q25, [x22, x12]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "str q26, [x21, x12]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x8, x8, #0x10\n"
+ "cmp x8, x17, LSL #4\n"
+ "str q27, [x20, x12]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmin v28.8h, v28.8h, v14.8h\n"
- "ldr q11, [x28, x17]\n"
- "ldr q12, [x27, x17]\n"
- "add x17, x17, #0x10\n"
- "cmp x17, x16, LSL #4\n"
"fmin v29.8h, v29.8h, v14.8h\n"
"fmin v30.8h, v30.8h, v14.8h\n"
- "add x12, x12, #0x10\n"
- "str q28, [x22, x11]\n"
"fmin v31.8h, v31.8h, v14.8h\n"
- "str q29, [x21, x11]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "str q30, [x20, x11]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "str q31, [x19, x11]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
+ "add x13, x13, #0x10\n"
+ "str q28, [x23, x12]\n"
+ "str q29, [x22, x12]\n"
+ "add x15, x15, #0xa0\n"
+ "str q30, [x21, x12]\n"
+ "str q31, [x20, x12]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
"mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "ldr x26, [x13, #0x20]\n"
- "ldr x25, [x13, #0x30]\n"
+ "ldr x27, [x14, #0x20]\n"
+ "ldr x26, [x14, #0x30]\n"
"mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
"mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "ldr x24, [x13, #0x28]\n"
- "ldr x23, [x13, #0x38]\n"
+ "ldr x25, [x14, #0x28]\n"
+ "ldr x24, [x14, #0x38]\n"
"mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
"mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x10, [x13, #0x40]\n"
- "ldr x9, [x13, #0x48]\n"
+ "ldr x11, [x14, #0x40]\n"
+ "ldr x10, [x14, #0x48]\n"
"mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
- "ldr x28, [x13, #0x50]\n"
- "ldr x27, [x13, #0x58]\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
"mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
"mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
- "ldr q9, [x25, x12]\n"
- "ldr x25, [x13, #0x70]\n"
+ "ldr q9, [x26, x13]\n"
+ "ldr x26, [x14, #0x70]\n"
"fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x13]\n"
"mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "ldr q10, [x26, x12]\n"
- "ldr q11, [x24, x12]\n"
+ "ldr q11, [x25, x13]\n"
"fmla v22.8h, v4.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
- "ldr x26, [x13, #0x60]\n"
- "ldr x24, [x13, #0x68]\n"
+ "ldr x27, [x14, #0x60]\n"
+ "ldr x25, [x14, #0x68]\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
"fmla v18.8h, v7.8h, v12.8h\n"
"mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "ldr q10, [x9, x12]\n"
- "ldr x9, [x13, #0x88]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0x88]\n"
"fmla v21.8h, v7.8h, v9.8h\n"
"fmla v19.8h, v6.8h, v12.8h\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
- "ldr q12, [x23, x12]\n"
- "ldr x23, [x13, #0x78]\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0x78]\n"
"mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x11, x13]\n"
"fmla v22.8h, v6.8h, v9.8h\n"
- "ldr q11, [x10, x12]\n"
- "ldr x10, [x13, #0x80]\n"
+ "ldr x11, [x14, #0x80]\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
- "add x11, x11, #0x10\n"
- "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "add x12, x12, #0x10\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v1.8h, v12.8h\n"
- "ldr q9, [x28, x12]\n"
- "ldr x28, [x13, #0x90]\n"
"fmla v17.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x28, x13]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
- "ldr q12, [x27, x12]\n"
- "ldr x27, [x13, #0x98]\n"
+ "ldr x28, [x14, #0x98]\n"
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q11, [x26, x12]\n"
- "ldr x26, [x13, #0xa0]\n"
+ "ldr q11, [x27, x13]\n"
+ "ldr x27, [x14, #0xa0]\n"
"fmla v22.8h, v7.8h, v10.8h\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
"fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x26, x13]\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0xb8]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x9, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
"fmla v29.8h, v2.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v10.8h\n"
- "fmla v31.8h, v0.8h, v10.8h\n"
- "ldr q10, [x24, x12]\n"
- "ldr x24, [x13, #0xa8]\n"
+ "ldr q10, [x25, x13]\n"
+ "ldr x25, [x14, #0xa8]\n"
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "fmla v28.8h, v3.8h, v11.8h\n"
- "ldr q11, [x25, x12]\n"
- "ldr x25, [x13, #0xb0]\n"
+ "ldr q11, [x11, x13]\n"
+ "ldr x11, [x14, #0xc0]\n"
"fmla v17.8h, v4.8h, v10.8h\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
- "ldr q12, [x23, x12]\n"
- "ldr x23, [x13, #0xb8]\n"
- "fmla v27.8h, v8.8h, v11.8h\n"
- "fmla v31.8h, v5.8h, v11.8h\n"
- "ldr q11, [x10, x12]\n"
- "ldr x10, [x13, #0xc0]\n"
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
- "ldr q10, [x9, x12]\n"
- "ldr x9, [x13, #0xc8]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0xc8]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
- "ldr q12, [x27, x12]\n"
- "ldr x27, [x13, #0xd8]\n"
+ "ldr q12, [x28, x13]\n"
+ "ldr x28, [x14, #0xd8]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "ldr q11, [x28, x12]\n"
- "ldr x28, [x13, #0xd0]\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr x9, [x14, #0xd0]\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
- "ldr q10, [x26, x12]\n"
- "ldr x26, [x13, #0xe0]\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr x27, [x14, #0xe0]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "ldr q11, [x24, x12]\n"
+ "ldr q11, [x25, x13]\n"
"fmla v27.8h, v1.8h, v12.8h\n"
- "ldr x24, [x13, #0xe8]\n"
+ "ldr x25, [x14, #0xe8]\n"
"fmla v19.8h, v7.8h, v12.8h\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
- "ldr q12, [x25, x12]\n"
- "ldr x25, [x13, #0xf0]\n"
+ "ldr q12, [x26, x13]\n"
+ "ldr x26, [x14, #0xf0]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
"fmla v18.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x24, x13]\n"
"fmla v20.8h, v7.8h, v11.8h\n"
- "ldr q10, [x23, x12]\n"
- "ldr x23, [x13, #0xf8]\n"
+ "ldr x24, [x14, #0xf8]\n"
"fmla v21.8h, v6.8h, v11.8h\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q11, [x10, x12]\n"
+ "ldr q11, [x11, x13]\n"
"fmla v27.8h, v4.8h, v11.8h\n"
- "ldr x10, [x13, #0x100]\n"
+ "ldr x11, [x14, #0x100]\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
"fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q12, [x9, x12]\n"
- "ldr x9, [x13, #0x108]\n"
+ "ldr q12, [x10, x13]\n"
+ "ldr x10, [x14, #0x108]\n"
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x13]\n"
"fmla v22.8h, v8.8h, v11.8h\n"
- "ldr q10, [x28, x12]\n"
- "ldr x28, [x13, #0x110]\n"
+ "ldr x9, [x14, #0x110]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
- "ldr q11, [x27, x12]\n"
+ "ldr q11, [x28, x13]\n"
"fmla v27.8h, v2.8h, v12.8h\n"
- "ldr x27, [x13, #0x118]\n"
+ "ldr x28, [x14, #0x118]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x27, x13]\n"
"fmla v20.8h, v6.8h, v10.8h\n"
- "ldr q12, [x26, x12]\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "ldr q10, [x24, x12]\n"
+ "ldr q10, [x25, x13]\n"
"fmla v25.8h, v7.8h, v11.8h\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
@@ -605,18 +605,18 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla v29.8h, v7.8h, v10.8h\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x12]\n"
+ "ldr q11, [x26, x13]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
- "ldr q10, [x10, x12]\n"
+ "ldr q10, [x11, x13]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q11, [x9, x12]\n"
+ "ldr q11, [x10, x13]\n"
"fmla v23.8h, v8.8h, v12.8h\n"
- "ldr q12, [x23, x12]\n"
+ "ldr q12, [x24, x13]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmax v16.8h, v16.8h, v15.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
@@ -627,145 +627,145 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmax v18.8h, v18.8h, v15.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v6.8h, v12.8h\n"
- "ldr q12, [x28, x12]\n"
+ "ldr q12, [x9, x13]\n"
"fmax v19.8h, v19.8h, v15.8h\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x27, x12]\n"
+ "ldr q10, [x28, x13]\n"
"fmin v16.8h, v16.8h, v14.8h\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"fmin v17.8h, v17.8h, v14.8h\n"
- "str q16, [x22, x11]\n"
+ "str q16, [x23, x12]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
"fmin v18.8h, v18.8h, v14.8h\n"
- "str q17, [x21, x11]\n"
+ "str q17, [x22, x12]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
"fmin v19.8h, v19.8h, v14.8h\n"
- "str q18, [x20, x11]\n"
+ "str q18, [x21, x12]\n"
"fmax v20.8h, v20.8h, v15.8h\n"
"fmax v21.8h, v21.8h, v15.8h\n"
- "str q19, [x19, x11]\n"
- "ldr x22, [x15, #0x20]\n"
+ "str q19, [x20, x12]\n"
+ "ldr x23, [x16, #0x20]\n"
"fmax v22.8h, v22.8h, v15.8h\n"
"fmax v23.8h, v23.8h, v15.8h\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "ldr x19, [x15, #0x38]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ldr x20, [x16, #0x38]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmin v20.8h, v20.8h, v14.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x22, x11]\n"
+ "str q20, [x23, x12]\n"
"fmin v22.8h, v22.8h, v14.8h\n"
"fmin v23.8h, v23.8h, v14.8h\n"
- "str q21, [x21, x11]\n"
- "ldr x22, [x15, #0x40]\n"
+ "str q21, [x22, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
"fmax v24.8h, v24.8h, v15.8h\n"
"fmax v25.8h, v25.8h, v15.8h\n"
- "str q22, [x20, x11]\n"
- "ldr x21, [x15, #0x48]\n"
+ "str q22, [x21, x12]\n"
+ "ldr x22, [x16, #0x48]\n"
"fmax v26.8h, v26.8h, v15.8h\n"
"fmax v27.8h, v27.8h, v15.8h\n"
- "str q23, [x19, x11]\n"
- "ldr x20, [x15, #0x50]\n"
- "ldr x19, [x15, #0x58]\n"
+ "str q23, [x20, x12]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmin v24.8h, v24.8h, v14.8h\n"
"fmin v25.8h, v25.8h, v14.8h\n"
- "str q24, [x22, x11]\n"
+ "str q24, [x23, x12]\n"
"fmin v26.8h, v26.8h, v14.8h\n"
"fmin v27.8h, v27.8h, v14.8h\n"
- "str q25, [x21, x11]\n"
- "ldr x22, [x15, #0x60]\n"
+ "str q25, [x22, x12]\n"
+ "ldr x23, [x16, #0x60]\n"
"fmax v28.8h, v28.8h, v15.8h\n"
"fmax v29.8h, v29.8h, v15.8h\n"
- "str q26, [x20, x11]\n"
- "ldr x21, [x15, #0x68]\n"
+ "str q26, [x21, x12]\n"
+ "ldr x22, [x16, #0x68]\n"
"fmax v30.8h, v30.8h, v15.8h\n"
"fmax v31.8h, v31.8h, v15.8h\n"
- "str q27, [x19, x11]\n"
- "ldr x20, [x15, #0x70]\n"
- "ldr x19, [x15, #0x78]\n"
+ "str q27, [x20, x12]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmin v28.8h, v28.8h, v14.8h\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "str q28, [x22, x11]\n"
+ "str q28, [x23, x12]\n"
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
- "str q29, [x21, x11]\n"
- "add x12, x12, #0x10\n"
- "str q30, [x20, x11]\n"
- "str q31, [x19, x11]\n"
+ "str q29, [x22, x12]\n"
+ "add x13, x13, #0x10\n"
+ "str q30, [x21, x12]\n"
+ "str q31, [x20, x12]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 140f\n"
- "ldr x10, [x13, #0x0]\n"
- "ldr x9, [x13, #0x8]\n"
- "ldr x28, [x13, #0x10]\n"
- "ldr x27, [x13, #0x18]\n"
- "mov x11, x12\n"
- "add x10, x10, x12\n"
- "ldr q13, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "add x9, x9, x12\n"
- "add x28, x28, x12\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "add x27, x27, x12\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x13\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x23, [x14, #0x0]\n"
+ "ldr x22, [x14, #0x8]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x21, [x14, #0x10]\n"
+ "ldr x20, [x14, #0x18]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.d }[0], [x10], #0x8\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.s }[2], [x10], #0x4\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[6], [x10], #0x2\n"
- "ld1 { v10.h }[6], [x9], #0x2\n"
- "ld1 { v11.h }[6], [x28], #0x2\n"
- "ld1 { v12.h }[6], [x27], #0x2\n"
+ "ld1 { v9.h }[6], [x23], #0x2\n"
+ "ld1 { v10.h }[6], [x22], #0x2\n"
+ "ld1 { v11.h }[6], [x21], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 7f\n"
"4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[4], [x10], #0x2\n"
- "ld1 { v10.h }[4], [x9], #0x2\n"
- "ld1 { v11.h }[4], [x28], #0x2\n"
- "ld1 { v12.h }[4], [x27], #0x2\n"
+ "ld1 { v9.h }[4], [x23], #0x2\n"
+ "ld1 { v10.h }[4], [x22], #0x2\n"
+ "ld1 { v11.h }[4], [x21], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 7f\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.s }[0], [x10], #0x4\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
- "ld1 { v11.s }[0], [x28], #0x4\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[2], [x10], #0x2\n"
- "ld1 { v10.h }[2], [x9], #0x2\n"
- "ld1 { v11.h }[2], [x28], #0x2\n"
- "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v9.h }[2], [x23], #0x2\n"
+ "ld1 { v10.h }[2], [x22], #0x2\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 7f\n"
"6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x10], #0x2\n"
- "ld1 { v10.h }[0], [x9], #0x2\n"
- "ld1 { v11.h }[0], [x28], #0x2\n"
- "ld1 { v12.h }[0], [x27], #0x2\n"
+ "ld1 { v9.h }[0], [x23], #0x2\n"
+ "ld1 { v10.h }[0], [x22], #0x2\n"
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
"mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
"mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "ldr x26, [x13, #0x20]\n"
- "add x26, x26, x12\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x20, x20, x13\n"
"mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
"mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
@@ -785,75 +785,75 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla v26.8h, v1.8h, v12.8h\n"
"mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 9f\n"
- "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v10.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 11f\n"
"8:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v10.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 11f\n"
"9:" // Oddments: Load input (5, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 11f\n"
"10:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (5, 0): Bit 2: End
- "ldr x24, [x13, #0x28]\n"
+ "ldr x20, [x14, #0x28]\n"
"mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 15f\n"
"12:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 15f\n"
"13:" // Oddments: Load input (5, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 15f\n"
"14:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (5, 5): Bit 2: End
- "ldr x25, [x13, #0x30]\n"
+ "ldr x20, [x14, #0x30]\n"
"mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.h }[6], [x25], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.h }[4], [x25], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: Load input (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.h }[2], [x25], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x25], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x23, [x13, #0x38]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v21.8h, v7.8h, v9.8h\n"
- "add x23, x23, x12\n"
+ "add x20, x20, x13\n"
"fmla v22.8h, v6.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v25.8h, v4.8h, v9.8h\n"
@@ -862,77 +862,77 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
"mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 21f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 23f\n"
"20:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 23f\n"
"21:" // Oddments: Load input (0, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 23f\n"
"22:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x10, [x13, #0x40]\n"
+ "ldr x20, [x14, #0x40]\n"
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 25f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v11.s }[2], [x10], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v11.h }[6], [x10], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 27f\n"
"24:" // Oddments: Load input (0, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v11.h }[4], [x10], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 27f\n"
"25:" // Oddments: Load input (0, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v11.s }[0], [x10], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v11.h }[2], [x10], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 27f\n"
"26:" // Oddments: Load input (0, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x10], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (0, 4): Bit 2: End
- "ldr x9, [x13, #0x48]\n"
+ "ldr x20, [x14, #0x48]\n"
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "add x9, x9, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 29f\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v10.h }[6], [x9], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 31f\n"
"28:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v10.h }[4], [x9], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 31f\n"
"29:" // Oddments: Load input (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 31f\n"
"30:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x28, [x13, #0x50]\n"
+ "ldr x20, [x14, #0x50]\n"
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v22.8h, v7.8h, v10.8h\n"
- "add x28, x28, x12\n"
+ "add x20, x20, x13\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
@@ -941,670 +941,670 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla v30.8h, v1.8h, v10.8h\n"
"fmla v31.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 33f\n"
- "ld1 { v9.d }[0], [x28], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v9.h }[6], [x28], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 35f\n"
"32:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v9.h }[4], [x28], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 35f\n"
"33:" // Oddments: Load input (1, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v9.s }[0], [x28], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v9.h }[2], [x28], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 35f\n"
"34:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x28], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x27, [x13, #0x58]\n"
+ "ldr x20, [x14, #0x58]\n"
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "add x27, x27, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 37f\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v12.h }[6], [x27], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 39f\n"
"36:" // Oddments: Load input (1, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v12.h }[4], [x27], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 39f\n"
"37:" // Oddments: Load input (1, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 39f\n"
"38:" // Oddments: Load input (1, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x27], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (1, 5): Bit 2: End
- "ldr x26, [x13, #0x60]\n"
+ "ldr x20, [x14, #0x60]\n"
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "add x26, x26, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 41f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v11.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 43f\n"
"40:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v11.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 43f\n"
"41:" // Oddments: Load input (4, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v11.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 43f\n"
"42:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (4, 0): Bit 2: End
- "ldr x24, [x13, #0x68]\n"
+ "ldr x20, [x14, #0x68]\n"
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 45f\n"
- "ld1 { v10.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v10.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v10.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 47f\n"
"44:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v10.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 47f\n"
"45:" // Oddments: Load input (1, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 46f\n"
- "ld1 { v10.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v10.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 47f\n"
"46:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (1, 2): Bit 2: End
- "ldr x25, [x13, #0x70]\n"
+ "ldr x20, [x14, #0x70]\n"
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 49f\n"
- "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 51f\n"
"48:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 51f\n"
"49:" // Oddments: Load input (4, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 51f\n"
"50:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 5): Bit 2: End
- "ldr x23, [x13, #0x78]\n"
+ "ldr x20, [x14, #0x78]\n"
"fmla v27.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x23, x23, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 53f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 55f\n"
"52:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 55f\n"
"53:" // Oddments: Load input (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 54f\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 55f\n"
"54:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x10, [x13, #0x80]\n"
+ "ldr x20, [x14, #0x80]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 57f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
- "ld1 { v11.s }[2], [x10], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.h }[6], [x10], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 59f\n"
"56:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.h }[4], [x10], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 59f\n"
"57:" // Oddments: Load input (5, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 58f\n"
- "ld1 { v11.s }[0], [x10], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.h }[2], [x10], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 59f\n"
"58:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x10], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (5, 1): Bit 2: End
- "ldr x9, [x13, #0x88]\n"
+ "ldr x20, [x14, #0x88]\n"
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "add x9, x9, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 61f\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v10.h }[6], [x9], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 63f\n"
"60:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v10.h }[4], [x9], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 63f\n"
"61:" // Oddments: Load input (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 62f\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 63f\n"
"62:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x28, [x13, #0x90]\n"
+ "ldr x20, [x14, #0x90]\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "add x28, x28, x12\n"
+ "add x20, x20, x13\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 65f\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v11.h }[6], [x28], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 67f\n"
"64:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v11.h }[4], [x28], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 67f\n"
"65:" // Oddments: Load input (5, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 66f\n"
- "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 67f\n"
"66:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x28], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (5, 4): Bit 2: End
- "ldr x27, [x13, #0x98]\n"
+ "ldr x20, [x14, #0x98]\n"
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "add x27, x27, x12\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 69f\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v12.h }[6], [x27], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 71f\n"
"68:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v12.h }[4], [x27], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 71f\n"
"69:" // Oddments: Load input (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 70f\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 71f\n"
"70:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x27], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x26, [x13, #0xa0]\n"
+ "ldr x20, [x14, #0xa0]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "add x26, x26, x12\n"
+ "add x20, x20, x13\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
"fmla v27.8h, v1.8h, v12.8h\n"
"tbz %x[n_channels], #2, 73f\n"
- "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
- "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v10.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 75f\n"
"72:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v10.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 75f\n"
"73:" // Oddments: Load input (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 74f\n"
- "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 75f\n"
"74:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x24, [x13, #0xa8]\n"
+ "ldr x20, [x14, #0xa8]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"fmla v18.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 77f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v11.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 79f\n"
"76:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v11.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 79f\n"
"77:" // Oddments: Load input (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 78f\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 79f\n"
"78:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x25, [x13, #0xb0]\n"
+ "ldr x20, [x14, #0xb0]\n"
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 81f\n"
- "ld1 { v12.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 80f\n"
- "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v12.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 83f\n"
"80:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v12.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 83f\n"
"81:" // Oddments: Load input (0, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 82f\n"
- "ld1 { v12.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v12.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 83f\n"
"82:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (0, 3): Bit 2: End
- "ldr x23, [x13, #0xb8]\n"
+ "ldr x20, [x14, #0xb8]\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "add x23, x23, x12\n"
+ "add x20, x20, x13\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 85f\n"
- "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 84f\n"
- "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 87f\n"
"84:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 87f\n"
"85:" // Oddments: Load input (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 86f\n"
- "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 87f\n"
"86:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"87:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x10, [x13, #0xc0]\n"
+ "ldr x20, [x14, #0xc0]\n"
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"fmla v24.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 89f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 88f\n"
- "ld1 { v11.s }[2], [x10], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 91f\n"
- "ld1 { v11.h }[6], [x10], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 91f\n"
"88:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 91f\n"
- "ld1 { v11.h }[4], [x10], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 91f\n"
"89:" // Oddments: Load input (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 90f\n"
- "ld1 { v11.s }[0], [x10], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 91f\n"
- "ld1 { v11.h }[2], [x10], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 91f\n"
"90:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x10], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"91:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x9, [x13, #0xc8]\n"
+ "ldr x20, [x14, #0xc8]\n"
"fmla v22.8h, v8.8h, v11.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x9, x9, x12\n"
+ "add x20, x20, x13\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 93f\n"
- "ld1 { v12.d }[0], [x9], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 92f\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 95f\n"
- "ld1 { v12.h }[6], [x9], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 95f\n"
"92:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 95f\n"
- "ld1 { v12.h }[4], [x9], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 95f\n"
"93:" // Oddments: Load input (2, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 94f\n"
- "ld1 { v12.s }[0], [x9], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 95f\n"
- "ld1 { v12.h }[2], [x9], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 95f\n"
"94:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x9], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"95:" // Oddments: Load input (2, 5): Bit 2: End
- "ldr x28, [x13, #0xd0]\n"
+ "ldr x20, [x14, #0xd0]\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
- "add x28, x28, x12\n"
+ "add x20, x20, x13\n"
"fmla v27.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 97f\n"
- "ld1 { v10.d }[0], [x28], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 96f\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 99f\n"
- "ld1 { v10.h }[6], [x28], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 99f\n"
"96:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 99f\n"
- "ld1 { v10.h }[4], [x28], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 99f\n"
"97:" // Oddments: Load input (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 98f\n"
- "ld1 { v10.s }[0], [x28], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 99f\n"
- "ld1 { v10.h }[2], [x28], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 99f\n"
"98:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x28], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"99:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x27, [x13, #0xd8]\n"
+ "ldr x20, [x14, #0xd8]\n"
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "add x27, x27, x12\n"
+ "add x20, x20, x13\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 101f\n"
- "ld1 { v11.d }[0], [x27], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 100f\n"
- "ld1 { v11.s }[2], [x27], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 103f\n"
- "ld1 { v11.h }[6], [x27], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 103f\n"
"100:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 103f\n"
- "ld1 { v11.h }[4], [x27], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 103f\n"
"101:" // Oddments: Load input (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 102f\n"
- "ld1 { v11.s }[0], [x27], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 103f\n"
- "ld1 { v11.h }[2], [x27], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 103f\n"
"102:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x27], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"103:" // Oddments: Load input (4, 2): Bit 2: End
- "ldr x26, [x13, #0xe0]\n"
+ "ldr x20, [x14, #0xe0]\n"
"fmla v24.8h, v8.8h, v11.8h\n"
"fmla v25.8h, v7.8h, v11.8h\n"
- "add x26, x26, x12\n"
+ "add x20, x20, x13\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"tbz %x[n_channels], #2, 105f\n"
- "ld1 { v12.d }[0], [x26], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 104f\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 107f\n"
- "ld1 { v12.h }[6], [x26], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 107f\n"
"104:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 107f\n"
- "ld1 { v12.h }[4], [x26], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 107f\n"
"105:" // Oddments: Load input (3, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 106f\n"
- "ld1 { v12.s }[0], [x26], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 107f\n"
- "ld1 { v12.h }[2], [x26], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 107f\n"
"106:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x26], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"107:" // Oddments: Load input (3, 5): Bit 2: End
- "ldr x24, [x13, #0xe8]\n"
+ "ldr x20, [x14, #0xe8]\n"
"fmla v23.8h, v8.8h, v12.8h\n"
"fmla v27.8h, v5.8h, v12.8h\n"
- "add x24, x24, x12\n"
+ "add x20, x20, x13\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 109f\n"
- "ld1 { v10.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 108f\n"
- "ld1 { v10.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 111f\n"
- "ld1 { v10.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 111f\n"
"108:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 111f\n"
- "ld1 { v10.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 111f\n"
"109:" // Oddments: Load input (5, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 110f\n"
- "ld1 { v10.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 111f\n"
- "ld1 { v10.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 111f\n"
"110:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"111:" // Oddments: Load input (5, 2): Bit 2: End
- "ldr x25, [x13, #0xf0]\n"
+ "ldr x20, [x14, #0xf0]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x25, x25, x12\n"
+ "add x20, x20, x13\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"tbz %x[n_channels], #2, 113f\n"
- "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 112f\n"
- "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 115f\n"
- "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 115f\n"
"112:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 115f\n"
- "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 115f\n"
"113:" // Oddments: Load input (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 114f\n"
- "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 115f\n"
- "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 115f\n"
"114:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"115:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x23, [x13, #0xf8]\n"
+ "ldr x20, [x14, #0xf8]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x23, x23, x12\n"
+ "add x20, x20, x13\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
"tbz %x[n_channels], #2, 117f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 116f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 119f\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 119f\n"
"116:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 119f\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 119f\n"
"117:" // Oddments: Load input (5, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 118f\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 119f\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 119f\n"
"118:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"119:" // Oddments: Load input (5, 3): Bit 2: End
- "ldr x10, [x13, #0x100]\n"
+ "ldr x20, [x14, #0x100]\n"
"fmla v29.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
- "add x10, x10, x12\n"
+ "add x20, x20, x13\n"
"fmla v31.8h, v6.8h, v12.8h\n"
"tbz %x[n_channels], #2, 121f\n"
- "ld1 { v10.d }[0], [x10], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 120f\n"
- "ld1 { v10.s }[2], [x10], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 123f\n"
- "ld1 { v10.h }[6], [x10], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 123f\n"
"120:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 123f\n"
- "ld1 { v10.h }[4], [x10], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 123f\n"
"121:" // Oddments: Load input (1, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 122f\n"
- "ld1 { v10.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 123f\n"
- "ld1 { v10.h }[2], [x10], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 123f\n"
"122:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x10], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"123:" // Oddments: Load input (1, 1): Bit 2: End
- "ldr x9, [x13, #0x108]\n"
+ "ldr x20, [x14, #0x108]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
- "add x9, x9, x12\n"
+ "add x20, x20, x13\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 125f\n"
- "ld1 { v11.d }[0], [x9], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 124f\n"
- "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 127f\n"
- "ld1 { v11.h }[6], [x9], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 127f\n"
"124:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 127f\n"
- "ld1 { v11.h }[4], [x9], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 127f\n"
"125:" // Oddments: Load input (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 126f\n"
- "ld1 { v11.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 127f\n"
- "ld1 { v11.h }[2], [x9], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 127f\n"
"126:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x9], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"127:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x28, [x13, #0x110]\n"
+ "ldr x20, [x14, #0x110]\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "add x28, x28, x12\n"
+ "add x20, x20, x13\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 129f\n"
- "ld1 { v12.d }[0], [x28], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 128f\n"
- "ld1 { v12.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 131f\n"
- "ld1 { v12.h }[6], [x28], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 131f\n"
"128:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 131f\n"
- "ld1 { v12.h }[4], [x28], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 131f\n"
"129:" // Oddments: Load input (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 130f\n"
- "ld1 { v12.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 131f\n"
- "ld1 { v12.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 131f\n"
"130:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x28], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"131:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x27, [x13, #0x118]\n"
+ "ldr x20, [x14, #0x118]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "add x27, x27, x12\n"
+ "add x20, x20, x13\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 133f\n"
- "ld1 { v10.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 132f\n"
- "ld1 { v10.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 135f\n"
- "ld1 { v10.h }[6], [x27], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 135f\n"
"132:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 135f\n"
- "ld1 { v10.h }[4], [x27], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 135f\n"
"133:" // Oddments: Load input (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 134f\n"
- "ld1 { v10.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 135f\n"
- "ld1 { v10.h }[2], [x27], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 135f\n"
"134:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x27], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"135:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
@@ -1643,363 +1643,363 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v14.8h\n"
"fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 137f\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "ldr x19, [x15, #0x18]\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.d }[0], [x22]\n"
- "st1 { v17.d }[0], [x21]\n"
- "ldr x22, [x15, #0x20]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x22, x22, x11\n"
- "st1 { v18.d }[0], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "st1 { v19.d }[0], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.d }[0], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.d }[0], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.d }[0], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.d }[0], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.d }[0], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.d }[0], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.d }[0], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.d }[0], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "add x11, x11, #0x8\n"
- "st1 { v28.d }[0], [x22]\n"
- "st1 { v29.d }[0], [x21]\n"
- "st1 { v30.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "add x12, x12, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 136f\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.s }[2], [x22]\n"
- "ldr x22, [x15, #0x20]\n"
- "st1 { v17.s }[2], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "st1 { v18.s }[2], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v19.s }[2], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.s }[2], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.s }[2], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.s }[2], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.s }[2], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.s }[2], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.s }[2], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.s }[2], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.s }[2], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "add x11, x11, #0x4\n"
- "st1 { v28.s }[2], [x22]\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.h }[6], [x22]\n"
- "ldr x22, [x15, #0x20]\n"
- "st1 { v17.h }[6], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "st1 { v18.h }[6], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v19.h }[6], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.h }[6], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.h }[6], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.h }[6], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.h }[6], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.h }[6], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.h }[6], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.h }[6], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.h }[6], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "st1 { v28.h }[6], [x22]\n"
- "st1 { v29.h }[6], [x21]\n"
- "st1 { v30.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.h }[6], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.h }[6], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[6], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 139f\n"
"136:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 139f\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "add x22, x22, x11\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.h }[4], [x22]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x22, x22, x11\n"
- "st1 { v17.h }[4], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v18.h }[4], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v19.h }[4], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.h }[4], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.h }[4], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.h }[4], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.h }[4], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.h }[4], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.h }[4], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.h }[4], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.h }[4], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "st1 { v28.h }[4], [x22]\n"
- "st1 { v29.h }[4], [x21]\n"
- "st1 { v30.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.h }[4], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.h }[4], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[4], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 139f\n"
"137:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 138f\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "add x22, x22, x11\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.s }[0], [x22]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x22, x22, x11\n"
- "st1 { v17.s }[0], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v18.s }[0], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v19.s }[0], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.s }[0], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.s }[0], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.s }[0], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.s }[0], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.s }[0], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.s }[0], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.s }[0], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.s }[0], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "add x11, x11, #0x4\n"
- "st1 { v28.s }[0], [x22]\n"
- "st1 { v29.s }[0], [x21]\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.h }[2], [x22]\n"
- "ldr x22, [x15, #0x20]\n"
- "st1 { v17.h }[2], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "st1 { v18.h }[2], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v19.h }[2], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.h }[2], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.h }[2], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.h }[2], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.h }[2], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.h }[2], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.h }[2], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.h }[2], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.h }[2], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "st1 { v28.h }[2], [x22]\n"
- "st1 { v29.h }[2], [x21]\n"
- "st1 { v30.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.h }[2], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.h }[2], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[2], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 139f\n"
"138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "ldr x22, [x15, #0x0]\n"
- "ldr x21, [x15, #0x8]\n"
- "add x22, x22, x11\n"
- "add x21, x21, x11\n"
- "ldr x20, [x15, #0x10]\n"
- "ldr x19, [x15, #0x18]\n"
- "add x20, x20, x11\n"
- "add x19, x19, x11\n"
- "st1 { v16.h }[0], [x22]\n"
- "ldr x22, [x15, #0x20]\n"
- "add x22, x22, x11\n"
- "st1 { v17.h }[0], [x21]\n"
- "ldr x21, [x15, #0x28]\n"
- "add x21, x21, x11\n"
- "st1 { v18.h }[0], [x20]\n"
- "ldr x20, [x15, #0x30]\n"
- "add x20, x20, x11\n"
- "st1 { v19.h }[0], [x19]\n"
- "ldr x19, [x15, #0x38]\n"
- "add x19, x19, x11\n"
- "st1 { v20.h }[0], [x22]\n"
- "ldr x22, [x15, #0x40]\n"
- "add x22, x22, x11\n"
- "st1 { v21.h }[0], [x21]\n"
- "ldr x21, [x15, #0x48]\n"
- "add x21, x21, x11\n"
- "st1 { v22.h }[0], [x20]\n"
- "ldr x20, [x15, #0x50]\n"
- "add x20, x20, x11\n"
- "st1 { v23.h }[0], [x19]\n"
- "ldr x19, [x15, #0x58]\n"
- "add x19, x19, x11\n"
- "st1 { v24.h }[0], [x22]\n"
- "ldr x22, [x15, #0x60]\n"
- "add x22, x22, x11\n"
- "st1 { v25.h }[0], [x21]\n"
- "ldr x21, [x15, #0x68]\n"
- "add x21, x21, x11\n"
- "st1 { v26.h }[0], [x20]\n"
- "ldr x20, [x15, #0x70]\n"
- "add x20, x20, x11\n"
- "st1 { v27.h }[0], [x19]\n"
- "ldr x19, [x15, #0x78]\n"
- "add x19, x19, x11\n"
- "st1 { v28.h }[0], [x22]\n"
- "st1 { v29.h }[0], [x21]\n"
- "st1 { v30.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x19]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x12\n"
+ "add x20, x20, x12\n"
+ "st1 { v16.h }[0], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.h }[0], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "add x22, x22, x12\n"
+ "st1 { v18.h }[0], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x20, x20, x12\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "add x22, x22, x12\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "add x20, x20, x12\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "add x22, x22, x12\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "add x20, x20, x12\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"139:" // Oddments: Store: Bit 2: End
"140:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index a5df51c4f9..268dda531d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,331 +87,331 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x22, #0x0\n"
- "mov x26, #0x0\n"
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
"1:" // Tile loop
- "str x22, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x21, #0x4\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
"mov x25, #0x2\n"
- "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x20, x22, x24\n" // offset = tile_i * ld_input_row
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x20, x26, x7, x20\n" // offset += tile_j * ld_input_col
- "ldr x8, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x7, x7, #0x1\n"
- "mul x19, x22, x23\n" // offset = tile_i * ld_output_row
- "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mov x22, #0x10\n" // cntb _, ALL, #1
- "mul x20, x20, x21\n" // offset *= kernel_stride * output_size
- "add x17, x17, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x15, x17, x24, LSL #1\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x19, x26, x8, x19\n" // offset += tile_j * ld_output_col
- "lsr x21, %x[n_channels], #0x3\n"
- "add x13, x15, x24, LSL #1\n"
- "mul x19, x19, x25\n" // offset *= output_tile_size
- "add x12, x7, x7\n"
- "add x11, x13, x24, LSL #1\n"
- "add x10, x12, x7\n"
- "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x1\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x8, x24, LSL #1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x14, x16, x24, LSL #1\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v19.8h }, [x20]\n"
- "ld1r { v18.8h }, [x19]\n"
- "add x9, x11, x24, LSL #1\n"
- "add x28, x10, x7\n"
- "add x27, x16, x23, LSL #1\n"
- "lsl x8, x8, #0x1\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x22\n"
- "cbz x21, 4f\n"
- "ldr q17, [x14, #0x0]\n"
- "cmp x22, x21, LSL #4\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "ldr q9, [x13, x12]\n"
- "add x14, x14, #0xa0\n"
- "ld1 { v10.8h }, [x17]\n"
- "ldr q11, [x17, x7]\n"
- "ldr q12, [x17, x10]\n"
- "ldr q13, [x17, x28]\n"
- "ld1 { v14.8h }, [x15]\n"
- "ldr q15, [x15, x7]\n"
- "ldr q16, [x17, x12]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "add x10, x12, x24, LSL #1\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #1\n"
+ "lsl x7, x7, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "add x22, x22, #0x10\n"
- "add x17, x17, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
"fmla v28.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x8]\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x15, x28]\n"
- "cmp x22, x21, LSL #4\n"
+ "ldr q12, [x16, x9]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x11]\n"
"fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q11, [x15, x10]\n"
- "ldr q13, [x15, x12]\n"
+ "ldr q13, [x16, x13]\n"
"fmla v28.8h, v3.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
"fmla v29.8h, v0.8h, v16.8h\n"
- "ld1 { v14.8h }, [x11]\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v4.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x14]\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "ld1 { v15.8h }, [x13]\n"
- "ldr q11, [x11, x7]\n"
+ "ldr q11, [x12, x6]\n"
"fmla v28.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x14, x6]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x13, x10]\n"
- "ldr q16, [x13, x7]\n"
+ "ldr q12, [x14, x11]\n"
"mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
"mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "add x19, x19, #0x10\n"
- "add x20, x20, #0x10\n"
+ "ldr q17, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x11, x10]\n"
- "ld1 { v10.8h }, [x17]\n"
+ "ldr q13, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
"fmla v30.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x12, x9]\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q14, [x11, x28]\n"
- "ldr q13, [x9, x7]\n"
+ "ldr q13, [x10, x6]\n"
"fmla v30.8h, v0.8h, v15.8h\n"
+ "ldr q0, [x15, #0x10]\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q17, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
+ "add x21, x21, #0x10\n"
"fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
"fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q11, [x13, x28]\n"
- "ldr q14, [x9, x10]\n"
+ "ldr q14, [x10, x11]\n"
"fmla v28.8h, v6.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x10]\n"
"fmla v30.8h, v1.8h, v16.8h\n"
- "ld1 { v15.8h }, [x9]\n"
- "add x13, x13, #0x10\n"
+ "ldr q1, [x15, #0x20]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0x30]\n"
"fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q16, [x11, x12]\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
+ "ldr q16, [x12, x13]\n"
"fmla v30.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x10, x13]\n"
"fmla v31.8h, v3.8h, v16.8h\n"
- "ldr q15, [x9, x12]\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
+ "ldr q3, [x15, #0x40]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x8, x9]\n"
"fmla v31.8h, v7.8h, v14.8h\n"
- "add x11, x11, #0x10\n"
- "ldr q9, [x13, x12]\n"
+ "ld1 { v14.8h }, [x16]\n"
"fmla v29.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x8, x11]\n"
"fmla v30.8h, v5.8h, v16.8h\n"
- "ldr q12, [x17, x10]\n"
- "ldr q13, [x17, x28]\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
"fmla v31.8h, v6.8h, v15.8h\n"
"fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x9, x28]\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q11, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
"fmax v30.8h, v30.8h, v19.8h\n"
- "add x9, x9, #0x10\n"
"fmax v31.8h, v31.8h, v19.8h\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
"fmin v29.8h, v29.8h, v18.8h\n"
- "ldr q11, [x17, x7]\n"
- "ld1 { v14.8h }, [x15]\n"
"fmin v30.8h, v30.8h, v18.8h\n"
"fmin v31.8h, v31.8h, v18.8h\n"
- "ldr q15, [x15, x7]\n"
- "ldr q16, [x17, x12]\n"
- "st1 { v28.8h }, [x16]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "str q29, [x16, x8]\n"
- "add x16, x16, #0x10\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "st1 { v30.8h }, [x27]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "str q31, [x27, x8]\n"
- "add x27, x27, #0x10\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v28.8h }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q29, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v30.8h }, [x28]\n"
+ "str q31, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x15, x28]\n"
+ "ldr q12, [x16, x9]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x11]\n"
"fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q11, [x15, x10]\n"
- "ldr q13, [x15, x12]\n"
+ "ldr q13, [x16, x13]\n"
"fmla v28.8h, v3.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
"fmla v29.8h, v0.8h, v16.8h\n"
- "ld1 { v14.8h }, [x11]\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v4.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x14]\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "ld1 { v15.8h }, [x13]\n"
- "ldr q11, [x11, x7]\n"
+ "ldr q11, [x12, x6]\n"
"fmla v28.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x14, x6]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr q12, [x13, x10]\n"
- "ldr q16, [x13, x7]\n"
+ "ldr q12, [x14, x11]\n"
"mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
"mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x11, x10]\n"
+ "ldr q13, [x12, x11]\n"
"fmla v30.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x12, x9]\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q14, [x11, x28]\n"
- "ldr q13, [x9, x7]\n"
+ "ldr q13, [x10, x6]\n"
"fmla v30.8h, v0.8h, v15.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x14, x9]\n"
"fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q11, [x13, x28]\n"
- "ldr q14, [x9, x10]\n"
+ "ldr q14, [x10, x11]\n"
"fmla v28.8h, v6.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x10]\n"
"fmla v30.8h, v1.8h, v16.8h\n"
- "ld1 { v15.8h }, [x9]\n"
- "add x13, x13, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q16, [x11, x12]\n"
+ "ldr q16, [x12, x13]\n"
"fmax v28.8h, v28.8h, v19.8h\n"
"fmla v30.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x10, x13]\n"
"fmla v31.8h, v3.8h, v16.8h\n"
- "ldr q15, [x9, x12]\n"
"fmin v28.8h, v28.8h, v18.8h\n"
"fmla v30.8h, v7.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v14.8h\n"
- "st1 { v28.8h }, [x16]\n"
- "add x11, x11, #0x10\n"
+ "st1 { v28.8h }, [x17]\n"
+ "add x12, x12, #0x10\n"
"fmla v29.8h, v7.8h, v12.8h\n"
"fmla v30.8h, v5.8h, v16.8h\n"
"fmla v31.8h, v6.8h, v15.8h\n"
"fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x9, x28]\n"
+ "ldr q11, [x10, x9]\n"
"fmax v29.8h, v29.8h, v19.8h\n"
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
"fmax v30.8h, v30.8h, v19.8h\n"
- "add x9, x9, #0x10\n"
+ "add x10, x10, #0x10\n"
"fmax v31.8h, v31.8h, v19.8h\n"
"fmin v29.8h, v29.8h, v18.8h\n"
- "str q29, [x16, x8]\n"
- "add x16, x16, #0x10\n"
+ "str q29, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
"fmin v30.8h, v30.8h, v18.8h\n"
"fmin v31.8h, v31.8h, v18.8h\n"
- "st1 { v30.8h }, [x27]\n"
- "str q31, [x27, x8]\n"
- "add x27, x27, #0x10\n"
+ "st1 { v30.8h }, [x28]\n"
+ "str q31, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 81f\n"
- "ldr q17, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "add x26, x13, x12\n"
- "add x25, x17, XZR\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "add x24, x17, x7\n"
- "add x23, x17, x10\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "add x22, x17, x28\n"
- "add x21, x15, XZR\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x20, x15, x7\n"
- "add x19, x17, x12\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
"tbz %x[n_channels], #2, 6f\n"
- "ldr d9, [x26], #0x8\n"
- "ldr d10, [x25], #0x8\n"
- "ldr d11, [x24], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "ldr d13, [x22], #0x8\n"
- "ldr d14, [x21], #0x8\n"
- "ldr d15, [x20], #0x8\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
- "ld1 { v14.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[6], [x26]\n"
- "ld1 { v10.h }[6], [x25]\n"
- "ld1 { v11.h }[6], [x24]\n"
- "ld1 { v12.h }[6], [x23]\n"
- "ld1 { v13.h }[6], [x22]\n"
- "ld1 { v14.h }[6], [x21]\n"
- "ld1 { v15.h }[6], [x20]\n"
- "ld1 { v16.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x27]\n"
+ "ld1 { v10.h }[6], [x26]\n"
+ "ld1 { v11.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x21]\n"
+ "ld1 { v16.h }[6], [x20]\n"
"b 8f\n"
"5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[4], [x26]\n"
- "ld1 { v10.h }[4], [x25]\n"
- "ld1 { v11.h }[4], [x24]\n"
- "ld1 { v12.h }[4], [x23]\n"
- "ld1 { v13.h }[4], [x22]\n"
- "ld1 { v14.h }[4], [x21]\n"
- "ld1 { v15.h }[4], [x20]\n"
- "ld1 { v16.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x27]\n"
+ "ld1 { v10.h }[4], [x26]\n"
+ "ld1 { v11.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x21]\n"
+ "ld1 { v16.h }[4], [x20]\n"
"b 8f\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 7f\n"
- "ldr s9, [x26], #0x4\n"
- "ldr s10, [x25], #0x4\n"
- "ldr s11, [x24], #0x4\n"
- "ldr s12, [x23], #0x4\n"
- "ldr s13, [x22], #0x4\n"
- "ldr s14, [x21], #0x4\n"
- "ldr s15, [x20], #0x4\n"
- "ldr s16, [x19], #0x4\n"
+ "ldr s9, [x27], #0x4\n"
+ "ldr s10, [x26], #0x4\n"
+ "ldr s11, [x25], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
+ "ldr s13, [x23], #0x4\n"
+ "ldr s14, [x22], #0x4\n"
+ "ldr s15, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.h }[2], [x26]\n"
- "ld1 { v10.h }[2], [x25]\n"
- "ld1 { v11.h }[2], [x24]\n"
- "ld1 { v12.h }[2], [x23]\n"
- "ld1 { v13.h }[2], [x22]\n"
- "ld1 { v14.h }[2], [x21]\n"
- "ld1 { v15.h }[2], [x20]\n"
- "ld1 { v16.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x27]\n"
+ "ld1 { v10.h }[2], [x26]\n"
+ "ld1 { v11.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x21]\n"
+ "ld1 { v16.h }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x26, #0x0]\n"
- "ldr h10, [x25, #0x0]\n"
- "ldr h11, [x24, #0x0]\n"
- "ldr h12, [x23, #0x0]\n"
- "ldr h13, [x22, #0x0]\n"
- "ldr h14, [x21, #0x0]\n"
- "ldr h15, [x20, #0x0]\n"
- "ldr h16, [x19, #0x0]\n"
+ "ldr h9, [x27, #0x0]\n"
+ "ldr h10, [x26, #0x0]\n"
+ "ldr h11, [x25, #0x0]\n"
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h13, [x23, #0x0]\n"
+ "ldr h14, [x22, #0x0]\n"
+ "ldr h15, [x21, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
"mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
- "add x19, x15, x10\n"
+ "add x20, x16, x11\n"
"mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
@@ -423,383 +423,383 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 10f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 12f\n"
"9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 12f\n"
"10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 11f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v29.8h, v4.8h, v11.8h\n"
- "add x19, x15, x28\n"
+ "add x20, x16, x9\n"
"tbz %x[n_channels], #2, 14f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 16f\n"
"13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 16f\n"
"14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 15f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v29.8h, v5.8h, v12.8h\n"
- "add x19, x15, x12\n"
+ "add x20, x16, x13\n"
"tbz %x[n_channels], #2, 18f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 20f\n"
"17:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 20f\n"
"18:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 19f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "add x19, x11, XZR\n"
+ "add x20, x12, XZR\n"
"tbz %x[n_channels], #2, 22f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v14.h }[6], [x19]\n"
+ "ld1 { v14.h }[6], [x20]\n"
"b 24f\n"
"21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v14.h }[4], [x19]\n"
+ "ld1 { v14.h }[4], [x20]\n"
"b 24f\n"
"22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 23f\n"
- "ldr s14, [x19], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v14.h }[2], [x19]\n"
+ "ld1 { v14.h }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x19, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v30.8h, v3.8h, v14.8h\n"
- "add x19, x13, XZR\n"
+ "add x20, x14, XZR\n"
"tbz %x[n_channels], #2, 26f\n"
- "ldr d15, [x19], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
- "ld1 { v15.s }[2], [x19], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v15.h }[6], [x19]\n"
+ "ld1 { v15.h }[6], [x20]\n"
"b 28f\n"
"25:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v15.h }[4], [x19]\n"
+ "ld1 { v15.h }[4], [x20]\n"
"b 28f\n"
"26:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 27f\n"
- "ldr s15, [x19], #0x4\n"
+ "ldr s15, [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v15.h }[2], [x19]\n"
+ "ld1 { v15.h }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h15, [x19, #0x0]\n"
+ "ldr h15, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v28.8h, v6.8h, v15.8h\n"
"fmla v30.8h, v0.8h, v15.8h\n"
- "add x19, x11, x7\n"
+ "add x20, x12, x6\n"
"tbz %x[n_channels], #2, 30f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 32f\n"
"29:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 32f\n"
"30:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 31f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v30.8h, v4.8h, v11.8h\n"
- "add x19, x13, x7\n"
+ "add x20, x14, x6\n"
"tbz %x[n_channels], #2, 34f\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v16.h }[6], [x19]\n"
+ "ld1 { v16.h }[6], [x20]\n"
"b 36f\n"
"33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v16.h }[4], [x19]\n"
+ "ld1 { v16.h }[4], [x20]\n"
"b 36f\n"
"34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 35f\n"
- "ldr s16, [x19], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v16.h }[2], [x19]\n"
+ "ld1 { v16.h }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h16, [x19, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
"fmla v28.8h, v7.8h, v16.8h\n"
"fmla v30.8h, v1.8h, v16.8h\n"
- "add x19, x11, x10\n"
+ "add x20, x12, x11\n"
"tbz %x[n_channels], #2, 38f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 40f\n"
"37:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 40f\n"
"38:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 39f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v31.8h, v4.8h, v13.8h\n"
- "add x19, x13, x10\n"
+ "add x20, x14, x11\n"
"tbz %x[n_channels], #2, 42f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 44f\n"
"41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 44f\n"
"42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 43f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
"fmla v29.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "add x19, x11, x28\n"
+ "add x20, x12, x9\n"
"tbz %x[n_channels], #2, 46f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v14.h }[6], [x19]\n"
+ "ld1 { v14.h }[6], [x20]\n"
"b 48f\n"
"45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v14.h }[4], [x19]\n"
+ "ld1 { v14.h }[4], [x20]\n"
"b 48f\n"
"46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 47f\n"
- "ldr s14, [x19], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v14.h }[2], [x19]\n"
+ "ld1 { v14.h }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x19, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
"fmla v31.8h, v5.8h, v14.8h\n"
- "add x19, x9, XZR\n"
+ "add x20, x10, XZR\n"
"tbz %x[n_channels], #2, 50f\n"
- "ldr d15, [x19], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
- "ld1 { v15.s }[2], [x19], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v15.h }[6], [x19]\n"
+ "ld1 { v15.h }[6], [x20]\n"
"b 52f\n"
"49:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v15.h }[4], [x19]\n"
+ "ld1 { v15.h }[4], [x20]\n"
"b 52f\n"
"50:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 51f\n"
- "ldr s15, [x19], #0x4\n"
+ "ldr s15, [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v15.h }[2], [x19]\n"
+ "ld1 { v15.h }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h15, [x19, #0x0]\n"
+ "ldr h15, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
"fmla v30.8h, v6.8h, v15.8h\n"
- "add x19, x13, x28\n"
+ "add x20, x14, x9\n"
"tbz %x[n_channels], #2, 54f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 56f\n"
"53:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 56f\n"
"54:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 55f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
"fmla v29.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "add x19, x9, x7\n"
+ "add x20, x10, x6\n"
"tbz %x[n_channels], #2, 58f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 57f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 60f\n"
"57:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 60f\n"
"58:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 59f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 60f\n"
"59:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
"fmla v30.8h, v7.8h, v13.8h\n"
- "add x19, x11, x12\n"
+ "add x20, x12, x13\n"
"tbz %x[n_channels], #2, 62f\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v16.h }[6], [x19]\n"
+ "ld1 { v16.h }[6], [x20]\n"
"b 64f\n"
"61:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v16.h }[4], [x19]\n"
+ "ld1 { v16.h }[4], [x20]\n"
"b 64f\n"
"62:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 63f\n"
- "ldr s16, [x19], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v16.h }[2], [x19]\n"
+ "ld1 { v16.h }[2], [x20]\n"
"b 64f\n"
"63:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h16, [x19, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
"64:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v30.8h, v5.8h, v16.8h\n"
"fmla v31.8h, v3.8h, v16.8h\n"
- "add x19, x9, x10\n"
+ "add x20, x10, x11\n"
"tbz %x[n_channels], #2, 66f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v14.h }[6], [x19]\n"
+ "ld1 { v14.h }[6], [x20]\n"
"b 68f\n"
"65:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v14.h }[4], [x19]\n"
+ "ld1 { v14.h }[4], [x20]\n"
"b 68f\n"
"66:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 67f\n"
- "ldr s14, [x19], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v14.h }[2], [x19]\n"
+ "ld1 { v14.h }[2], [x20]\n"
"b 68f\n"
"67:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x19, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
"fmla v31.8h, v7.8h, v14.8h\n"
- "add x19, x9, x12\n"
+ "add x20, x10, x13\n"
"tbz %x[n_channels], #2, 70f\n"
- "ldr d15, [x19], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
- "ld1 { v15.s }[2], [x19], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v15.h }[6], [x19]\n"
+ "ld1 { v15.h }[6], [x20]\n"
"b 72f\n"
"69:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v15.h }[4], [x19]\n"
+ "ld1 { v15.h }[4], [x20]\n"
"b 72f\n"
"70:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 71f\n"
- "ldr s15, [x19], #0x4\n"
+ "ldr s15, [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v15.h }[2], [x19]\n"
+ "ld1 { v15.h }[2], [x20]\n"
"b 72f\n"
"71:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h15, [x19, #0x0]\n"
+ "ldr h15, [x20, #0x0]\n"
"72:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v15.8h\n"
- "add x19, x9, x28\n"
+ "add x20, x10, x9\n"
"tbz %x[n_channels], #2, 74f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 73f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 76f\n"
"73:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 76f\n"
"74:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 75f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 76f\n"
"75:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
"fmax v28.8h, v28.8h, v19.8h\n"
@@ -811,82 +811,82 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmin v30.8h, v30.8h, v18.8h\n"
"fmin v31.8h, v31.8h, v18.8h\n"
"tbz %x[n_channels], #2, 78f\n"
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.d }[0], [x20], x8\n"
- "add x16, x16, #0x8\n"
- "add x27, x27, #0x8\n"
- "st1 { v30.d }[0], [x19], x8\n"
- "st1 { v29.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 77f\n"
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.s }[2], [x20], x8\n"
- "add x16, x16, #0x4\n"
- "add x27, x27, #0x4\n"
- "st1 { v30.s }[2], [x19], x8\n"
- "st1 { v29.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 80f\n"
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.h }[6], [x20], x8\n"
- "st1 { v30.h }[6], [x19], x8\n"
- "st1 { v29.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[6], [x21], x7\n"
+ "st1 { v30.h }[6], [x20], x7\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 80f\n"
"77:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 80f\n"
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.h }[4], [x20], x8\n"
- "st1 { v30.h }[4], [x19], x8\n"
- "st1 { v29.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[4], [x21], x7\n"
+ "st1 { v30.h }[4], [x20], x7\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 80f\n"
"78:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 79f\n"
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.s }[0], [x20], x8\n"
- "st1 { v30.s }[0], [x19], x8\n"
- "add x16, x16, #0x4\n"
- "add x27, x27, #0x4\n"
- "st1 { v29.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 80f\n"
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.h }[2], [x20], x8\n"
- "st1 { v30.h }[2], [x19], x8\n"
- "st1 { v29.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[2], [x21], x7\n"
+ "st1 { v30.h }[2], [x20], x7\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 80f\n"
"79:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "mov x20, x16\n"
- "mov x19, x27\n"
- "st1 { v28.h }[0], [x20], x8\n"
- "st1 { v30.h }[0], [x19], x8\n"
- "st1 { v29.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[0], [x21], x7\n"
+ "st1 { v30.h }[0], [x20], x7\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"80:" // Tile loop: Oddments: Store: Bit 2: End
"81:" // Tile loop: End
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x26, x26, #0x1\n"
- "add x20, x22, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x26, x19\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x22, x22, x20, LT\n"
- "csel x26, x26, XZR, LT\n"
- "cmp x22, x19\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 61c58186f5..144d11fb39 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,347 +88,347 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x16, #0x10\n" // cntb _, ALL, #1
- "lsr x15, %x[n_channels], #0x3\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "lsr x25, %x[n_channels], #0x3\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
- "ldp x13, x12, [x21, #0x0]\n"
- "ldp x11, x10, [x21, #0x10]\n"
- "add x9, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ld1r { v19.8h }, [x20]\n"
- "ld1r { v18.8h }, [x19]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
"mov x28, #0x0\n"
- "sub x27, XZR, x16\n"
- "cbz x15, 3f\n"
- "ldp x26, x25, [x9, #0x0]\n"
- "ldp x24, x23, [x9, #0x10]\n"
- "ldp x22, x21, [x9, #0x20]\n"
- "ldp x20, x19, [x9, #0x30]\n"
- "cmp x16, x15, LSL #4\n"
- "ldr q17, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
- "ldr q9, [x26, x28]\n"
- "ldr q10, [x25, x28]\n"
- "ldr q11, [x24, x28]\n"
- "ldr q12, [x23, x28]\n"
+ "sub x23, XZR, x26\n"
+ "cbz x25, 3f\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q0, [x24, #0x10]\n"
+ "cmp x26, x25, LSL #4\n"
+ "ldr q1, [x24, #0x20]\n"
+ "ldr q2, [x24, #0x30]\n"
+ "ldr q3, [x24, #0x40]\n"
+ "ldr q4, [x24, #0x50]\n"
+ "ldr q5, [x24, #0x60]\n"
+ "ldr q6, [x24, #0x70]\n"
+ "ldr q7, [x24, #0x80]\n"
+ "ldr q8, [x24, #0x90]\n"
+ "add x24, x24, #0xa0\n"
+ "ldp x22, x20, [x13, #0x0]\n"
+ "ldr q9, [x22, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
"ldr q13, [x22, x28]\n"
"ldr q14, [x21, x28]\n"
- "ldr q15, [x20, x28]\n"
- "ldr q16, [x19, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "ldr x26, [x9, #0x40]\n"
- "ldr x25, [x9, #0x48]\n"
+ "ldr x22, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x25, x28]\n"
- "ldr x24, [x9, #0x50]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x22, x28]\n"
"fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q11, [x26, x28]\n"
- "ldr q13, [x24, x28]\n"
+ "ldr q13, [x21, x28]\n"
"fmla v28.8h, v3.8h, v14.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
- "ldr x23, [x9, #0x58]\n"
- "ldr x19, [x9, #0x78]\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q14, [x20, x28]\n"
"fmla v28.8h, v4.8h, v15.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q14, [x23, x28]\n"
- "ldr x22, [x9, #0x60]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "ldr x22, [x13, #0x60]\n"
+ "ldr q15, [x22, x28]\n"
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr x26, [x9, #0x80]\n"
- "ldr q15, [x22, x28]\n"
+ "ldr x22, [x13, #0x80]\n"
+ "ldr q12, [x22, x28]\n"
"mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
"mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q12, [x26, x28]\n"
- "ldr x21, [x9, #0x68]\n"
+ "ldr q17, [x24, #0x0]\n"
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x19, x28]\n"
- "ldr x25, [x9, #0x88]\n"
+ "ldr q13, [x20, x28]\n"
+ "ldr x21, [x13, #0x68]\n"
+ "ldr q11, [x21, x28]\n"
"fmla v30.8h, v3.8h, v14.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q11, [x21, x28]\n"
- "ldr q14, [x25, x28]\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q14, [x20, x28]\n"
"fmla v30.8h, v0.8h, v15.8h\n"
+ "ldr q0, [x24, #0x10]\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x20, [x9, #0x70]\n"
- "ldr x23, [x9, #0x98]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "ldr q16, [x21, x28]\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr q11, [x23, x28]\n"
+ "ldr q4, [x24, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
"fmla v28.8h, v6.8h, v15.8h\n"
- "ldr x24, [x9, #0x90]\n"
- "ldr x21, [x9, #0xa8]\n"
"fmla v30.8h, v1.8h, v16.8h\n"
+ "ldr q11, [x20, x28]\n"
+ "ldr q1, [x24, #0x20]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q15, [x24, x28]\n"
- "ldr q16, [x21, x28]\n"
- "ldr x22, [x9, #0xa0]\n"
- "ldr x20, [x9, #0xb0]\n"
+ "ldr q2, [x24, #0x30]\n"
+ "ldr x21, [x13, #0x90]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr x21, [x13, #0xa8]\n"
"fmla v30.8h, v6.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x22, [x13, #0xa0]\n"
"fmla v31.8h, v3.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
"ldr q13, [x22, x28]\n"
- "ldr q14, [x20, x28]\n"
+ "ldr q3, [x24, #0x40]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v14.8h\n"
- "ldr x19, [x9, #0xb8]\n"
- "fmla v29.8h, v7.8h, v12.8h\n"
- "ldr q15, [x19, x28]\n"
"fmla v30.8h, v5.8h, v16.8h\n"
- "ldr x26, [x9, #0xc0]\n"
+ "ldr q5, [x24, #0x60]\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "add x23, x23, #0x10\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "ldr q14, [x21, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "ldr q15, [x20, x28]\n"
+ "ldr q7, [x24, #0x80]\n"
"fmla v31.8h, v6.8h, v15.8h\n"
- "fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x26, x28]\n"
"fmla v30.8h, v8.8h, v15.8h\n"
- "fmla v31.8h, v8.8h, v11.8h\n"
- "ldp x26, x25, [x9, #0x0]\n"
- "ldp x24, x23, [x9, #0x10]\n"
- "ldp x22, x21, [x9, #0x20]\n"
- "ldp x20, x19, [x9, #0x30]\n"
- "fmax v28.8h, v28.8h, v19.8h\n"
- "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q6, [x24, #0x70]\n"
+ "ldr x22, [x13, #0xc0]\n"
"fmax v30.8h, v30.8h, v19.8h\n"
- "fmax v31.8h, v31.8h, v19.8h\n"
- "ldr q9, [x26, x16]\n"
- "ldr q10, [x25, x16]\n"
- "ldr q11, [x24, x16]\n"
- "ldr q12, [x23, x16]\n"
- "add x27, x27, #0x10\n"
- "fmin v28.8h, v28.8h, v18.8h\n"
- "ldr q13, [x22, x16]\n"
- "ldr q14, [x21, x16]\n"
- "fmin v29.8h, v29.8h, v18.8h\n"
"fmin v30.8h, v30.8h, v18.8h\n"
- "ldr q15, [x20, x16]\n"
- "ldr q16, [x19, x16]\n"
- "add x16, x16, #0x10\n"
- "cmp x16, x15, LSL #4\n"
+ "ldr q11, [x22, x28]\n"
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "ldr q8, [x24, #0x90]\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "ldp x22, x20, [x13, #0x0]\n"
+ "ldr q9, [x22, x26]\n"
"fmin v31.8h, v31.8h, v18.8h\n"
"add x28, x28, #0x10\n"
- "str q28, [x13, x27]\n"
- "ldr q17, [x14, #0x0]\n"
- "str q29, [x12, x27]\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
- "str q30, [x11, x27]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "str q31, [x10, x27]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
+ "ldr q10, [x20, x26]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q28, [x12, x23]\n"
+ "add x24, x24, #0xa0\n"
+ "ldr q11, [x21, x26]\n"
+ "ldr q12, [x20, x26]\n"
+ "str q29, [x11, x23]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldr q13, [x22, x26]\n"
+ "str q30, [x10, x23]\n"
+ "ldr q14, [x21, x26]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q31, [x9, x23]\n"
+ "ldr q15, [x21, x26]\n"
+ "ldr q16, [x20, x26]\n"
+ "add x26, x26, #0x10\n"
+ "cmp x26, x25, LSL #4\n"
"blt 1b\n"
"2:" // Channel tail
"mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "ldr x26, [x9, #0x40]\n"
- "ldr x25, [x9, #0x48]\n"
+ "ldr x22, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q12, [x25, x28]\n"
- "ldr x24, [x9, #0x50]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x22, x28]\n"
"fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q11, [x26, x28]\n"
- "ldr q13, [x24, x28]\n"
+ "ldr q13, [x21, x28]\n"
"fmla v28.8h, v3.8h, v14.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
- "ldr x23, [x9, #0x58]\n"
- "ldr x19, [x9, #0x78]\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q14, [x20, x28]\n"
"fmla v28.8h, v4.8h, v15.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "ldr q14, [x23, x28]\n"
- "ldr x22, [x9, #0x60]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "ldr x22, [x13, #0x60]\n"
+ "ldr q15, [x22, x28]\n"
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "ldr x26, [x9, #0x80]\n"
- "ldr q15, [x22, x28]\n"
+ "ldr x22, [x13, #0x80]\n"
+ "ldr q12, [x22, x28]\n"
"mov v30.16b, v17.16b\n fmla v30.8h, v2.8h, v9.8h\n"
"mov v31.16b, v17.16b\n fmla v31.8h, v0.8h, v9.8h\n"
- "ldr q12, [x26, x28]\n"
- "ldr x21, [x9, #0x68]\n"
+ "ldr x21, [x13, #0x68]\n"
+ "ldr q11, [x21, x28]\n"
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q13, [x19, x28]\n"
- "ldr x25, [x9, #0x88]\n"
+ "ldr q13, [x20, x28]\n"
"fmla v30.8h, v3.8h, v14.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "ldr q11, [x21, x28]\n"
- "ldr q14, [x25, x28]\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q14, [x20, x28]\n"
"fmla v30.8h, v0.8h, v15.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr x20, [x9, #0x70]\n"
- "ldr x23, [x9, #0x98]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
"fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x20, x28]\n"
"fmla v31.8h, v5.8h, v14.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr q11, [x23, x28]\n"
"fmla v28.8h, v6.8h, v15.8h\n"
- "ldr x24, [x9, #0x90]\n"
- "ldr x21, [x9, #0xa8]\n"
+ "ldr x21, [x13, #0x90]\n"
+ "ldr q15, [x21, x28]\n"
"fmla v30.8h, v1.8h, v16.8h\n"
+ "ldr x21, [x13, #0xa8]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v7.8h, v16.8h\n"
- "ldr q15, [x24, x28]\n"
"ldr q16, [x21, x28]\n"
- "ldr x22, [x9, #0xa0]\n"
- "ldr x20, [x9, #0xb0]\n"
+ "ldr x22, [x13, #0xa0]\n"
+ "ldr q13, [x22, x28]\n"
"fmla v30.8h, v6.8h, v15.8h\n"
"fmla v31.8h, v3.8h, v16.8h\n"
- "ldr q13, [x22, x28]\n"
- "ldr q14, [x20, x28]\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "ldr q14, [x21, x28]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v14.8h\n"
- "ldr x19, [x9, #0xb8]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q15, [x20, x28]\n"
"fmla v29.8h, v7.8h, v12.8h\n"
- "ldr q15, [x19, x28]\n"
"fmla v30.8h, v5.8h, v16.8h\n"
- "ldr x26, [x9, #0xc0]\n"
+ "ldr x22, [x13, #0xc0]\n"
"fmla v31.8h, v6.8h, v15.8h\n"
"fmla v29.8h, v8.8h, v11.8h\n"
- "ldr q11, [x26, x28]\n"
+ "ldr q11, [x22, x28]\n"
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v8.8h, v11.8h\n"
"fmax v28.8h, v28.8h, v19.8h\n"
- "add x27, x27, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmax v29.8h, v29.8h, v19.8h\n"
"fmax v30.8h, v30.8h, v19.8h\n"
"add x28, x28, #0x10\n"
"fmax v31.8h, v31.8h, v19.8h\n"
"fmin v28.8h, v28.8h, v18.8h\n"
- "str q28, [x13, x27]\n"
+ "str q28, [x12, x23]\n"
"fmin v29.8h, v29.8h, v18.8h\n"
"fmin v30.8h, v30.8h, v18.8h\n"
- "str q29, [x12, x27]\n"
+ "str q29, [x11, x23]\n"
"fmin v31.8h, v31.8h, v18.8h\n"
- "str q30, [x11, x27]\n"
- "str q31, [x10, x27]\n"
+ "str q30, [x10, x23]\n"
+ "str q31, [x9, x23]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 80f\n"
- "mov x27, x28\n"
- "ldr x26, [x9, #0x0]\n"
- "ldr x25, [x9, #0x8]\n"
- "ldr x24, [x9, #0x10]\n"
- "add x13, x13, x27\n"
- "add x12, x12, x27\n"
- "ldr x23, [x9, #0x18]\n"
- "ldr x22, [x9, #0x20]\n"
- "add x11, x11, x27\n"
- "add x10, x10, x27\n"
- "ldr x21, [x9, #0x28]\n"
- "ldr x20, [x9, #0x30]\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q0, [x24, #0x10]\n"
+ "mov x23, x28\n"
+ "add x12, x12, x23\n"
+ "ldr q1, [x24, #0x20]\n"
+ "ldr q2, [x24, #0x30]\n"
+ "add x11, x11, x23\n"
+ "add x10, x10, x23\n"
+ "ldr q3, [x24, #0x40]\n"
+ "ldr q4, [x24, #0x50]\n"
+ "add x9, x9, x23\n"
+ "ldr q5, [x24, #0x60]\n"
+ "ldr q6, [x24, #0x70]\n"
+ "ldr q7, [x24, #0x80]\n"
+ "ldr q8, [x24, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
"add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
"add x25, x25, x28\n"
- "ldr x19, [x9, #0x38]\n"
- "ldr q17, [x14, #0x0]\n"
"add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
"add x23, x23, x28\n"
- "ldr q0, [x14, #0x10]\n"
- "ldr q1, [x14, #0x20]\n"
"add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
"add x21, x21, x28\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
"add x20, x20, x28\n"
- "add x19, x19, x28\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v9.d }[0], [x26], #0x8\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
- "ld1 { v14.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
- "ld1 { v14.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[6], [x26], #0x2\n"
- "ld1 { v10.h }[6], [x25], #0x2\n"
- "ld1 { v11.h }[6], [x24], #0x2\n"
- "ld1 { v12.h }[6], [x23], #0x2\n"
- "ld1 { v13.h }[6], [x22], #0x2\n"
- "ld1 { v14.h }[6], [x21], #0x2\n"
- "ld1 { v15.h }[6], [x20], #0x2\n"
- "ld1 { v16.h }[6], [x19], #0x2\n"
+ "ld1 { v9.h }[6], [x27], #0x2\n"
+ "ld1 { v10.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x23], #0x2\n"
+ "ld1 { v14.h }[6], [x22], #0x2\n"
+ "ld1 { v15.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
"b 7f\n"
"4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[4], [x26], #0x2\n"
- "ld1 { v10.h }[4], [x25], #0x2\n"
- "ld1 { v11.h }[4], [x24], #0x2\n"
- "ld1 { v12.h }[4], [x23], #0x2\n"
- "ld1 { v13.h }[4], [x22], #0x2\n"
- "ld1 { v14.h }[4], [x21], #0x2\n"
- "ld1 { v15.h }[4], [x20], #0x2\n"
- "ld1 { v16.h }[4], [x19], #0x2\n"
+ "ld1 { v9.h }[4], [x27], #0x2\n"
+ "ld1 { v10.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x23], #0x2\n"
+ "ld1 { v14.h }[4], [x22], #0x2\n"
+ "ld1 { v15.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
"b 7f\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.s }[0], [x26], #0x4\n"
- "ld1 { v10.s }[0], [x25], #0x4\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v13.s }[0], [x22], #0x4\n"
- "ld1 { v14.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "ld1 { v16.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.h }[2], [x26], #0x2\n"
- "ld1 { v10.h }[2], [x25], #0x2\n"
- "ld1 { v11.h }[2], [x24], #0x2\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
- "ld1 { v13.h }[2], [x22], #0x2\n"
- "ld1 { v14.h }[2], [x21], #0x2\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "ld1 { v16.h }[2], [x19], #0x2\n"
+ "ld1 { v9.h }[2], [x27], #0x2\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "ld1 { v14.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"b 7f\n"
"6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x26], #0x2\n"
- "ld1 { v10.h }[0], [x25], #0x2\n"
- "ld1 { v11.h }[0], [x24], #0x2\n"
- "ld1 { v12.h }[0], [x23], #0x2\n"
- "ld1 { v13.h }[0], [x22], #0x2\n"
- "ld1 { v14.h }[0], [x21], #0x2\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "ld1 { v16.h }[0], [x19], #0x2\n"
+ "ld1 { v9.h }[0], [x27], #0x2\n"
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "ld1 { v14.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
"mov v28.16b, v17.16b\n fmla v28.8h, v8.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
- "ldr x26, [x9, #0x40]\n"
- "add x26, x26, x28\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
"mov v29.16b, v17.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
@@ -440,143 +440,143 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"fmla v28.8h, v2.8h, v16.8h\n"
"fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 9f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 11f\n"
"8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 11f\n"
"9:" // Oddments: Load input (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 11f\n"
"10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x25, [x9, #0x48]\n"
+ "ldr x20, [x13, #0x48]\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "add x25, x25, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v12.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v12.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 15f\n"
"12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v12.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 15f\n"
"13:" // Oddments: Load input (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v12.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v12.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 15f\n"
"14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x24, [x9, #0x50]\n"
+ "ldr x20, [x13, #0x50]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "add x24, x24, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: Load input (1, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (1, 2): Bit 2: End
- "ldr x23, [x9, #0x58]\n"
+ "ldr x20, [x13, #0x58]\n"
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "add x23, x23, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 21f\n"
- "ld1 { v14.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v14.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v14.h }[6], [x23], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
"b 23f\n"
"20:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v14.h }[4], [x23], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
"b 23f\n"
"21:" // Oddments: Load input (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v14.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v14.h }[2], [x23], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"b 23f\n"
"22:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v14.h }[0], [x23], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x22, [x9, #0x60]\n"
+ "ldr x20, [x13, #0x60]\n"
"fmla v30.8h, v3.8h, v14.8h\n"
- "add x22, x22, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 25f\n"
- "ld1 { v15.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v15.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v15.h }[6], [x22], #0x2\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
"b 27f\n"
"24:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v15.h }[4], [x22], #0x2\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
"b 27f\n"
"25:" // Oddments: Load input (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v15.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v15.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"b 27f\n"
"26:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v15.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x21, [x9, #0x68]\n"
+ "ldr x20, [x13, #0x68]\n"
"fmla v28.8h, v6.8h, v15.8h\n"
"fmla v30.8h, v0.8h, v15.8h\n"
- "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 29f\n"
- "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[6], [x21], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 31f\n"
"28:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[4], [x21], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 31f\n"
"29:" // Oddments: Load input (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 31f\n"
"30:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x9, #0x70]\n"
+ "ldr x20, [x13, #0x70]\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"add x20, x20, x28\n"
"tbz %x[n_channels], #2, 33f\n"
@@ -599,171 +599,171 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v16.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x19, [x9, #0x78]\n"
+ "ldr x20, [x13, #0x78]\n"
"fmla v28.8h, v7.8h, v16.8h\n"
"fmla v30.8h, v1.8h, v16.8h\n"
- "add x19, x19, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 37f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v13.h }[6], [x19], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 39f\n"
"36:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v13.h }[4], [x19], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 39f\n"
"37:" // Oddments: Load input (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v13.s }[0], [x19], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v13.h }[2], [x19], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 39f\n"
"38:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x19], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x26, [x9, #0x80]\n"
+ "ldr x20, [x13, #0x80]\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "add x26, x26, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 41f\n"
- "ld1 { v12.d }[0], [x26], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v12.h }[6], [x26], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 43f\n"
"40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v12.h }[4], [x26], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 43f\n"
"41:" // Oddments: Load input (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v12.s }[0], [x26], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v12.h }[2], [x26], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 43f\n"
"42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x26], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x25, [x9, #0x88]\n"
+ "ldr x20, [x13, #0x88]\n"
"fmla v29.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "add x25, x25, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 45f\n"
- "ld1 { v14.d }[0], [x25], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v14.h }[6], [x25], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
"b 47f\n"
"44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v14.h }[4], [x25], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
"b 47f\n"
"45:" // Oddments: Load input (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 46f\n"
- "ld1 { v14.s }[0], [x25], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v14.h }[2], [x25], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"b 47f\n"
"46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v14.h }[0], [x25], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x24, [x9, #0x90]\n"
+ "ldr x20, [x13, #0x90]\n"
"fmla v31.8h, v5.8h, v14.8h\n"
- "add x24, x24, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 49f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v15.h }[6], [x24], #0x2\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
"b 51f\n"
"48:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v15.h }[4], [x24], #0x2\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
"b 51f\n"
"49:" // Oddments: Load input (4, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v15.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v15.h }[2], [x24], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"b 51f\n"
"50:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v15.h }[0], [x24], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 0): Bit 2: End
- "ldr x23, [x9, #0x98]\n"
+ "ldr x20, [x13, #0x98]\n"
"fmla v30.8h, v6.8h, v15.8h\n"
- "add x23, x23, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 53f\n"
- "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
- "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v11.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 55f\n"
"52:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v11.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 55f\n"
"53:" // Oddments: Load input (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 54f\n"
- "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v11.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 55f\n"
"54:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x22, [x9, #0xa0]\n"
+ "ldr x20, [x13, #0xa0]\n"
"fmla v29.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "add x22, x22, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 57f\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v13.h }[6], [x22], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 59f\n"
"56:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v13.h }[4], [x22], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 59f\n"
"57:" // Oddments: Load input (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 58f\n"
- "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v13.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 59f\n"
"58:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x21, [x9, #0xa8]\n"
+ "ldr x20, [x13, #0xa8]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 61f\n"
- "ld1 { v16.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
- "ld1 { v16.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v16.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
"b 63f\n"
"60:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v16.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
"b 63f\n"
"61:" // Oddments: Load input (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 62f\n"
- "ld1 { v16.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v16.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"b 63f\n"
"62:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v16.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x20, [x9, #0xb0]\n"
+ "ldr x20, [x13, #0xb0]\n"
"fmla v30.8h, v5.8h, v16.8h\n"
"fmla v31.8h, v3.8h, v16.8h\n"
"add x20, x20, x28\n"
@@ -787,52 +787,52 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v14.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x19, [x9, #0xb8]\n"
+ "ldr x20, [x13, #0xb8]\n"
"fmla v31.8h, v7.8h, v14.8h\n"
- "add x19, x19, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 69f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
- "ld1 { v15.s }[2], [x19], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v15.h }[6], [x19], #0x2\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
"b 71f\n"
"68:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v15.h }[4], [x19], #0x2\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
"b 71f\n"
"69:" // Oddments: Load input (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 70f\n"
- "ld1 { v15.s }[0], [x19], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v15.h }[2], [x19], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
"b 71f\n"
"70:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v15.h }[0], [x19], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (4, 2): Bit 2: End
- "ldr x26, [x9, #0xc0]\n"
+ "ldr x20, [x13, #0xc0]\n"
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v15.8h\n"
- "add x26, x26, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #2, 73f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v11.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 75f\n"
"72:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v11.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 75f\n"
"73:" // Oddments: Load input (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 74f\n"
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v11.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 75f\n"
"74:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v11.8h\n"
"fmax v28.8h, v28.8h, v19.8h\n"
@@ -844,52 +844,50 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v18.8h\n"
"fmin v31.8h, v31.8h, v18.8h\n"
"tbz %x[n_channels], #2, 77f\n"
- "st1 { v28.d }[0], [x13], #0x8\n"
- "st1 { v29.d }[0], [x12], #0x8\n"
- "st1 { v30.d }[0], [x11], #0x8\n"
- "st1 { v31.d }[0], [x10], #0x8\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
- "st1 { v28.s }[2], [x13], #0x4\n"
- "st1 { v29.s }[2], [x12], #0x4\n"
- "st1 { v30.s }[2], [x11], #0x4\n"
- "st1 { v31.s }[2], [x10], #0x4\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "st1 { v28.h }[6], [x13], #0x2\n"
- "st1 { v29.h }[6], [x12], #0x2\n"
- "st1 { v30.h }[6], [x11], #0x2\n"
- "st1 { v31.h }[6], [x10], #0x2\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
"b 79f\n"
"76:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 79f\n"
- "st1 { v28.h }[4], [x13], #0x2\n"
- "st1 { v29.h }[4], [x12], #0x2\n"
- "st1 { v30.h }[4], [x11], #0x2\n"
- "st1 { v31.h }[4], [x10], #0x2\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
"b 79f\n"
"77:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 78f\n"
- "st1 { v28.s }[0], [x13], #0x4\n"
- "st1 { v29.s }[0], [x12], #0x4\n"
- "st1 { v30.s }[0], [x11], #0x4\n"
- "st1 { v31.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "st1 { v28.h }[2], [x13], #0x2\n"
- "st1 { v29.h }[2], [x12], #0x2\n"
- "st1 { v30.h }[2], [x11], #0x2\n"
- "st1 { v31.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
"b 79f\n"
"78:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "st1 { v28.h }[0], [x13], #0x2\n"
- "st1 { v29.h }[0], [x12], #0x2\n"
- "st1 { v30.h }[0], [x11], #0x2\n"
- "st1 { v31.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
"79:" // Oddments: Store: Bit 2: End
-
"80:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index b08059db0a..8807f5d306 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,403 +87,403 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x27, #0x0\n"
"mov x26, #0x0\n"
- "mov x25, #0x0\n"
"1:" // Tile loop
- "str x26, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x22, #0x2\n"
- "mov x21, #0x2\n"
- "str x25, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x3, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x20, x26, x24\n" // offset = tile_i * ld_input_row
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x20, x25, x3, x20\n" // offset += tile_j * ld_input_col
- "ldr x4, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x3, x3, #0x1\n"
- "mul x19, x26, x23\n" // offset = tile_i * ld_output_row
- "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x7, x3, x3\n"
- "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x8, x5, x24, LSL #1\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x19, x25, x4, x19\n" // offset += tile_j * ld_output_col
- "add x16, x8, x24, LSL #1\n"
- "mov x22, #0x10\n" // cntb _, ALL, #1
- "mul x19, x19, x21\n" // offset *= output_tile_size
- "lsr x21, %x[n_channels], #0x3\n"
- "add x15, x16, x24, LSL #1\n"
- "add x14, x7, x3\n"
- "add x13, x15, x24, LSL #1\n"
- "add x12, x14, x3\n"
- "add x6, x6, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x1\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x7, x4, x24, LSL #1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #1\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x16, x17, x24, LSL #1\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #1\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.8h }, [x20]\n"
- "ld1r { v17.8h }, [x19]\n"
- "add x11, x13, x24, LSL #1\n"
- "add x10, x12, x3\n"
- "add x9, x6, x23, LSL #1\n"
- "lsl x4, x4, #0x1\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x22\n"
- "cbz x21, 4f\n"
- "ldr q16, [x17, #0x0]\n"
- "cmp x22, x21, LSL #4\n"
- "ldr q0, [x17, #0x10]\n"
- "ldr q1, [x17, #0x20]\n"
- "ldr q2, [x17, #0x30]\n"
- "ldr q3, [x17, #0x40]\n"
- "ldr q4, [x17, #0x50]\n"
- "ld1 { v5.8h }, [x5]\n"
- "add x17, x17, #0x60\n"
- "ldr q6, [x5, x3]\n"
- "ld1 { v7.8h }, [x8]\n"
- "ldr q8, [x8, x3]\n"
- "ldr q9, [x5, x7]\n"
- "ldr q13, [x8, x7]\n"
- "ldr q11, [x5, x14]\n"
- "ldr q12, [x5, x12]\n"
- "ldr q10, [x8, x10]\n"
- "ld1 { v14.8h }, [x16]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #1\n"
+ "lsl x3, x3, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q16, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.8h }, [x17]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x7, x15]\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr q5, [x8, x14]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x17, #0x0]\n"
- "cmp x22, x21, LSL #4\n"
+ "ldr q0, [x8, #0x0]\n"
+ "ldr q16, [x8, #0x140]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x7, x13]\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q6, [x8, x12]\n"
- "add x8, x8, #0x10\n"
+ "add x7, x7, #0x10\n"
"fmla v30.8h, v1.8h, v8.8h\n"
"fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x17, #0x10]\n"
- "add x19, x19, #0x10\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
"fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x4, x11]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q9, [x5, x10]\n"
- "add x5, x5, #0x10\n"
+ "add x4, x4, #0x10\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x17, #0x20]\n"
+ "ldr q2, [x8, #0x20]\n"
"add x20, x20, #0x10\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x17, x2]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q11, [x16, x3]\n"
- "ldr q16, [x17, #0x140]\n"
+ "add x21, x21, #0x10\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x17, #0x30]\n"
+ "ldr q3, [x8, #0x30]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x17, x6]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q12, [x16, x7]\n"
- "ldr q9, [x16, x14]\n"
+ "ldr q9, [x17, x15]\n"
"fmla v30.8h, v4.8h, v6.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x17, #0x40]\n"
+ "ldr q4, [x8, #0x40]\n"
"fmla v28.8h, v0.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x7]\n"
"fmla v29.8h, v0.8h, v8.8h\n"
- "ld1 { v7.8h }, [x8]\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x17, #0x50]\n"
+ "ldr q0, [x8, #0x50]\n"
"fmla v28.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x17, x11]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr q8, [x16, x10]\n"
"fmla v30.8h, v1.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x17, #0x60]\n"
+ "ldr q1, [x8, #0x60]\n"
"fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x17, x13]\n"
"fmla v29.8h, v2.8h, v5.8h\n"
- "ldr q13, [x16, x12]\n"
- "add x16, x16, #0x10\n"
+ "add x17, x17, #0x10\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x17, #0x70]\n"
+ "ldr q2, [x8, #0x70]\n"
"fmla v28.8h, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x16]\n"
"fmla v29.8h, v3.8h, v6.8h\n"
- "ld1 { v5.8h }, [x15]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x17, #0x80]\n"
+ "ldr q3, [x8, #0x80]\n"
"fmla v28.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x16, x2]\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q6, [x15, x3]\n"
- "ldr q10, [x15, x7]\n"
+ "ldr q10, [x16, x6]\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x17, #0x90]\n"
+ "ldr q4, [x8, #0x90]\n"
"fmla v28.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x16, x11]\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q14, [x15, x10]\n"
"fmla v30.8h, v0.8h, v5.8h\n"
"fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x17, #0xa0]\n"
+ "ldr q0, [x8, #0xa0]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x15]\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q11, [x15, x14]\n"
"fmla v30.8h, v1.8h, v6.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x17, #0xb0]\n"
+ "ldr q1, [x8, #0xb0]\n"
"fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x16, x13]\n"
"fmla v29.8h, v2.8h, v9.8h\n"
- "ldr q12, [x15, x12]\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x17, #0xc0]\n"
+ "ldr q2, [x8, #0xc0]\n"
"fmla v28.8h, v3.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x14]\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ld1 { v9.8h }, [x13]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x17, #0xd0]\n"
+ "ldr q3, [x8, #0xd0]\n"
"fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x14, x2]\n"
"fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q13, [x13, x3]\n"
- "ldr q8, [x13, x12]\n"
+ "ldr q8, [x14, x13]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x17, #0xe0]\n"
+ "ldr q4, [x8, #0xe0]\n"
"fmla v28.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x14, x6]\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "ldr q5, [x13, x7]\n"
"fmla v30.8h, v0.8h, v9.8h\n"
"fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x17, #0xf0]\n"
+ "ldr q0, [x8, #0xf0]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x14, x15]\n"
"fmla v29.8h, v1.8h, v10.8h\n"
- "ldr q6, [x13, x14]\n"
"fmla v30.8h, v1.8h, v13.8h\n"
"fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x17, #0x100]\n"
+ "ldr q1, [x8, #0x100]\n"
"fmla v28.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x14, x11]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q10, [x13, x10]\n"
- "add x13, x13, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v30.8h, v2.8h, v5.8h\n"
"fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x17, #0x110]\n"
+ "ldr q2, [x8, #0x110]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ld1 { v11.8h }, [x11]\n"
"fmla v30.8h, v3.8h, v6.8h\n"
"fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x17, #0x120]\n"
+ "ldr q3, [x8, #0x120]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x12, x2]\n"
"fmla v29.8h, v4.8h, v14.8h\n"
- "ldr q12, [x11, x3]\n"
- "ld1 { v14.8h }, [x16]\n"
+ "ld1 { v14.8h }, [x17]\n"
"fmla v30.8h, v4.8h, v8.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x17, #0x130]\n"
+ "ldr q4, [x8, #0x130]\n"
"fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x6]\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "ldr q9, [x11, x7]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x12, x15]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
- "ldr q11, [x11, x14]\n"
- "ldr q0, [x17, #0x150]\n"
+ "ldr q0, [x8, #0x150]\n"
"fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q13, [x7, x6]\n"
"fmla v29.8h, v1.8h, v5.8h\n"
- "ldr q13, [x8, x7]\n"
"fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x12, x13]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
- "ldr q12, [x11, x12]\n"
- "ldr q1, [x17, #0x160]\n"
+ "ldr q1, [x8, #0x160]\n"
"fmla v28.8h, v2.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x4]\n"
"fmla v29.8h, v2.8h, v6.8h\n"
- "ld1 { v5.8h }, [x5]\n"
"fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x12, x11]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q9, [x11, x10]\n"
- "add x11, x11, #0x10\n"
+ "ldr q2, [x8, #0x170]\n"
"fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q6, [x4, x2]\n"
"fmla v29.8h, v3.8h, v8.8h\n"
- "ldr q6, [x5, x3]\n"
- "ldr q2, [x17, #0x170]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x4, x15]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q11, [x5, x14]\n"
- "ldr q3, [x17, #0x180]\n"
+ "ldr q3, [x8, #0x180]\n"
"fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x7, x2]\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "ldr q8, [x8, x3]\n"
+ "ldr q10, [x7, x11]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x4, x13]\n"
"fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v18.8h\n"
- "ldr q9, [x5, x7]\n"
+ "add x8, x8, #0x1a0\n"
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
- "ldr q12, [x5, x12]\n"
- "ldr q10, [x8, x10]\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x6]\n"
- "ldr q4, [x17, #0x190]\n"
+ "st1 { v28.8h }, [x5]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x6, x4]\n"
- "add x6, x6, #0x10\n"
- "st1 { v30.8h }, [x9]\n"
- "add x17, x17, #0x1a0\n"
- "str q31, [x9, x4]\n"
- "add x9, x9, #0x10\n"
+ "str q29, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.8h }, [x10]\n"
+ "str q31, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x7, x15]\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr q5, [x8, x14]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x7, x13]\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q6, [x8, x12]\n"
- "add x8, x8, #0x10\n"
+ "add x7, x7, #0x10\n"
"fmla v30.8h, v1.8h, v8.8h\n"
"fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q1, [x17, #0x10]\n"
+ "ldr q1, [x8, #0x10]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x4, x11]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q9, [x5, x10]\n"
- "add x5, x5, #0x10\n"
+ "add x4, x4, #0x10\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"fmla v31.8h, v2.8h, v5.8h\n"
- "ldr q2, [x17, #0x20]\n"
+ "ldr q2, [x8, #0x20]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x17, x2]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q11, [x16, x3]\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"fmla v31.8h, v3.8h, v6.8h\n"
- "ldr q3, [x17, #0x30]\n"
+ "ldr q3, [x8, #0x30]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x17, x6]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q12, [x16, x7]\n"
- "ldr q9, [x16, x14]\n"
+ "ldr q9, [x17, x15]\n"
"fmla v30.8h, v4.8h, v6.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x17, #0x40]\n"
+ "ldr q4, [x8, #0x40]\n"
"fmla v28.8h, v0.8h, v7.8h\n"
"fmla v29.8h, v0.8h, v8.8h\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"fmla v31.8h, v0.8h, v11.8h\n"
- "ldr q0, [x17, #0x50]\n"
+ "ldr q0, [x8, #0x50]\n"
"fmla v28.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x17, x11]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr q8, [x16, x10]\n"
"fmla v30.8h, v1.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "ldr q1, [x17, #0x60]\n"
+ "ldr q1, [x8, #0x60]\n"
"fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x17, x13]\n"
"fmla v29.8h, v2.8h, v5.8h\n"
- "ldr q13, [x16, x12]\n"
- "add x16, x16, #0x10\n"
+ "add x17, x17, #0x10\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q2, [x17, #0x70]\n"
+ "ldr q2, [x8, #0x70]\n"
"fmla v28.8h, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x16]\n"
"fmla v29.8h, v3.8h, v6.8h\n"
- "ld1 { v5.8h }, [x15]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
- "ldr q3, [x17, #0x80]\n"
+ "ldr q3, [x8, #0x80]\n"
"fmla v28.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x16, x2]\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q6, [x15, x3]\n"
- "ldr q10, [x15, x7]\n"
+ "ldr q10, [x16, x6]\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v8.8h\n"
- "ldr q4, [x17, #0x90]\n"
+ "ldr q4, [x8, #0x90]\n"
"fmla v28.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x16, x11]\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q14, [x15, x10]\n"
"fmla v30.8h, v0.8h, v5.8h\n"
"fmla v31.8h, v0.8h, v6.8h\n"
- "ldr q0, [x17, #0xa0]\n"
+ "ldr q0, [x8, #0xa0]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x15]\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q11, [x15, x14]\n"
"fmla v30.8h, v1.8h, v6.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
- "ldr q1, [x17, #0xb0]\n"
+ "ldr q1, [x8, #0xb0]\n"
"fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x16, x13]\n"
"fmla v29.8h, v2.8h, v9.8h\n"
- "ldr q12, [x15, x12]\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q2, [x17, #0xc0]\n"
+ "ldr q2, [x8, #0xc0]\n"
"fmla v28.8h, v3.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x14]\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ld1 { v9.8h }, [x13]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldr q3, [x17, #0xd0]\n"
+ "ldr q3, [x8, #0xd0]\n"
"fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x14, x2]\n"
"fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q13, [x13, x3]\n"
- "ldr q8, [x13, x12]\n"
+ "ldr q8, [x14, x13]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v14.8h\n"
- "ldr q4, [x17, #0xe0]\n"
+ "ldr q4, [x8, #0xe0]\n"
"fmla v28.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x14, x6]\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "ldr q5, [x13, x7]\n"
"fmla v30.8h, v0.8h, v9.8h\n"
"fmla v31.8h, v0.8h, v13.8h\n"
- "ldr q0, [x17, #0xf0]\n"
+ "ldr q0, [x8, #0xf0]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x14, x15]\n"
"fmla v29.8h, v1.8h, v10.8h\n"
- "ldr q6, [x13, x14]\n"
"fmla v30.8h, v1.8h, v13.8h\n"
"fmla v31.8h, v1.8h, v5.8h\n"
- "ldr q1, [x17, #0x100]\n"
+ "ldr q1, [x8, #0x100]\n"
"fmla v28.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x14, x11]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q10, [x13, x10]\n"
- "add x13, x13, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v30.8h, v2.8h, v5.8h\n"
"fmla v31.8h, v2.8h, v6.8h\n"
- "ldr q2, [x17, #0x110]\n"
+ "ldr q2, [x8, #0x110]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ld1 { v11.8h }, [x11]\n"
"fmla v30.8h, v3.8h, v6.8h\n"
"fmla v31.8h, v3.8h, v8.8h\n"
- "ldr q3, [x17, #0x120]\n"
+ "ldr q3, [x8, #0x120]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x12, x2]\n"
"fmla v29.8h, v4.8h, v14.8h\n"
- "ldr q12, [x11, x3]\n"
"fmla v30.8h, v4.8h, v8.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "ldr q4, [x17, #0x130]\n"
- "add x17, x17, #0x140\n"
+ "ldr q4, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
"fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x6]\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "ldr q9, [x11, x7]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x12, x15]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
- "ldr q11, [x11, x14]\n"
"fmla v28.8h, v1.8h, v13.8h\n"
"fmla v29.8h, v1.8h, v5.8h\n"
"fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x12, x13]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
- "ldr q12, [x11, x12]\n"
"fmla v28.8h, v2.8h, v5.8h\n"
"fmla v29.8h, v2.8h, v6.8h\n"
"fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x12, x11]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q9, [x11, x10]\n"
- "add x11, x11, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v28.8h, v3.8h, v6.8h\n"
"fmla v29.8h, v3.8h, v8.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
@@ -498,120 +498,120 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmax v31.8h, v31.8h, v18.8h\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "st1 { v28.8h }, [x6]\n"
+ "st1 { v28.8h }, [x5]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x6, x4]\n"
- "add x6, x6, #0x10\n"
- "st1 { v30.8h }, [x9]\n"
- "str q31, [x9, x4]\n"
- "add x9, x9, #0x10\n"
+ "str q29, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.8h }, [x10]\n"
+ "str q31, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 117f\n"
- "ldr q16, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
- "ldr q1, [x17, #0x20]\n"
- "ldr q2, [x17, #0x30]\n"
- "add x28, x5, XZR\n"
- "add x27, x5, x3\n"
- "ldr q3, [x17, #0x40]\n"
- "ldr q4, [x17, #0x50]\n"
- "add x26, x8, XZR\n"
- "add x25, x8, x3\n"
- "add x24, x5, x7\n"
- "add x23, x8, x7\n"
- "add x22, x5, x14\n"
- "add x21, x5, x12\n"
- "add x20, x8, x10\n"
- "add x19, x16, XZR\n"
- "add x17, x17, #0x60\n"
+ "ldr q16, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
"tbz %x[n_channels], #2, 6f\n"
- "ldr d5, [x28], #0x8\n"
- "ldr d6, [x27], #0x8\n"
- "ldr d7, [x26], #0x8\n"
- "ldr d8, [x25], #0x8\n"
- "ldr d9, [x24], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d11, [x22], #0x8\n"
- "ldr d12, [x21], #0x8\n"
- "ldr d10, [x20], #0x8\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v5.s }[2], [x28], #0x4\n"
- "ld1 { v6.s }[2], [x27], #0x4\n"
- "ld1 { v7.s }[2], [x26], #0x4\n"
- "ld1 { v8.s }[2], [x25], #0x4\n"
- "ld1 { v9.s }[2], [x24], #0x4\n"
- "ld1 { v13.s }[2], [x23], #0x4\n"
- "ld1 { v11.s }[2], [x22], #0x4\n"
- "ld1 { v12.s }[2], [x21], #0x4\n"
- "ld1 { v10.s }[2], [x20], #0x4\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v5.h }[6], [x28]\n"
- "ld1 { v6.h }[6], [x27]\n"
- "ld1 { v7.h }[6], [x26]\n"
- "ld1 { v8.h }[6], [x25]\n"
- "ld1 { v9.h }[6], [x24]\n"
- "ld1 { v13.h }[6], [x23]\n"
- "ld1 { v11.h }[6], [x22]\n"
- "ld1 { v12.h }[6], [x21]\n"
- "ld1 { v10.h }[6], [x20]\n"
- "ld1 { v14.h }[6], [x19]\n"
+ "ld1 { v5.h }[6], [x9]\n"
+ "ld1 { v6.h }[6], [x28]\n"
+ "ld1 { v7.h }[6], [x27]\n"
+ "ld1 { v8.h }[6], [x26]\n"
+ "ld1 { v9.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v11.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x22]\n"
+ "ld1 { v10.h }[6], [x21]\n"
+ "ld1 { v14.h }[6], [x20]\n"
"b 8f\n"
"5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v5.h }[4], [x28]\n"
- "ld1 { v6.h }[4], [x27]\n"
- "ld1 { v7.h }[4], [x26]\n"
- "ld1 { v8.h }[4], [x25]\n"
- "ld1 { v9.h }[4], [x24]\n"
- "ld1 { v13.h }[4], [x23]\n"
- "ld1 { v11.h }[4], [x22]\n"
- "ld1 { v12.h }[4], [x21]\n"
- "ld1 { v10.h }[4], [x20]\n"
- "ld1 { v14.h }[4], [x19]\n"
+ "ld1 { v5.h }[4], [x9]\n"
+ "ld1 { v6.h }[4], [x28]\n"
+ "ld1 { v7.h }[4], [x27]\n"
+ "ld1 { v8.h }[4], [x26]\n"
+ "ld1 { v9.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v11.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x22]\n"
+ "ld1 { v10.h }[4], [x21]\n"
+ "ld1 { v14.h }[4], [x20]\n"
"b 8f\n"
"6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 7f\n"
- "ldr s5, [x28], #0x4\n"
- "ldr s6, [x27], #0x4\n"
- "ldr s7, [x26], #0x4\n"
- "ldr s8, [x25], #0x4\n"
- "ldr s9, [x24], #0x4\n"
- "ldr s13, [x23], #0x4\n"
- "ldr s11, [x22], #0x4\n"
- "ldr s12, [x21], #0x4\n"
- "ldr s10, [x20], #0x4\n"
- "ldr s14, [x19], #0x4\n"
+ "ldr s5, [x9], #0x4\n"
+ "ldr s6, [x28], #0x4\n"
+ "ldr s7, [x27], #0x4\n"
+ "ldr s8, [x26], #0x4\n"
+ "ldr s9, [x25], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
+ "ldr s11, [x23], #0x4\n"
+ "ldr s12, [x22], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v5.h }[2], [x28]\n"
- "ld1 { v6.h }[2], [x27]\n"
- "ld1 { v7.h }[2], [x26]\n"
- "ld1 { v8.h }[2], [x25]\n"
- "ld1 { v9.h }[2], [x24]\n"
- "ld1 { v13.h }[2], [x23]\n"
- "ld1 { v11.h }[2], [x22]\n"
- "ld1 { v12.h }[2], [x21]\n"
- "ld1 { v10.h }[2], [x20]\n"
- "ld1 { v14.h }[2], [x19]\n"
+ "ld1 { v5.h }[2], [x9]\n"
+ "ld1 { v6.h }[2], [x28]\n"
+ "ld1 { v7.h }[2], [x27]\n"
+ "ld1 { v8.h }[2], [x26]\n"
+ "ld1 { v9.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v11.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x21]\n"
+ "ld1 { v14.h }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h5, [x28, #0x0]\n"
- "ldr h6, [x27, #0x0]\n"
- "ldr h7, [x26, #0x0]\n"
- "ldr h8, [x25, #0x0]\n"
- "ldr h9, [x24, #0x0]\n"
- "ldr h13, [x23, #0x0]\n"
- "ldr h11, [x22, #0x0]\n"
- "ldr h12, [x21, #0x0]\n"
- "ldr h10, [x20, #0x0]\n"
- "ldr h14, [x19, #0x0]\n"
+ "ldr h5, [x9, #0x0]\n"
+ "ldr h6, [x28, #0x0]\n"
+ "ldr h7, [x27, #0x0]\n"
+ "ldr h8, [x26, #0x0]\n"
+ "ldr h9, [x25, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h11, [x23, #0x0]\n"
+ "ldr h12, [x22, #0x0]\n"
+ "ldr h10, [x21, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
"mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "add x19, x8, x14\n"
+ "add x20, x7, x15\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
@@ -622,676 +622,676 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"tbz %x[n_channels], #2, 10f\n"
- "ldr d5, [x19], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v5.h }[6], [x19]\n"
+ "ld1 { v5.h }[6], [x20]\n"
"b 12f\n"
"9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v5.h }[4], [x19]\n"
+ "ld1 { v5.h }[4], [x20]\n"
"b 12f\n"
"10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 11f\n"
- "ldr s5, [x19], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v5.h }[2], [x19]\n"
+ "ld1 { v5.h }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h5, [x19, #0x0]\n"
+ "ldr h5, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v31.8h, v2.8h, v5.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x19, x8, x12\n"
+ "add x20, x7, x13\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"tbz %x[n_channels], #2, 14f\n"
- "ldr d6, [x19], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v6.h }[6], [x19]\n"
+ "ld1 { v6.h }[6], [x20]\n"
"b 16f\n"
"13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v6.h }[4], [x19]\n"
+ "ld1 { v6.h }[4], [x20]\n"
"b 16f\n"
"14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 15f\n"
- "ldr s6, [x19], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v6.h }[2], [x19]\n"
+ "ld1 { v6.h }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h6, [x19, #0x0]\n"
+ "ldr h6, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v31.8h, v3.8h, v6.8h\n"
"fmla v28.8h, v4.8h, v12.8h\n"
- "add x19, x5, x10\n"
+ "add x20, x4, x11\n"
"tbz %x[n_channels], #2, 18f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 20f\n"
"17:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 20f\n"
"18:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 19f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
"fmla v30.8h, v4.8h, v6.8h\n"
- "ldr q0, [x17, #0x0]\n"
- "add x19, x16, x3\n"
+ "add x20, x17, x2\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"fmla v28.8h, v0.8h, v7.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"fmla v29.8h, v0.8h, v8.8h\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"tbz %x[n_channels], #2, 22f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 24f\n"
"21:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 24f\n"
"22:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 23f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.8h, v0.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v8.8h\n"
- "add x19, x16, x7\n"
+ "add x20, x17, x6\n"
"fmla v29.8h, v1.8h, v13.8h\n"
"fmla v30.8h, v1.8h, v11.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 26f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 28f\n"
"25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 28f\n"
"26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 27f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x19, x16, x14\n"
+ "add x20, x17, x15\n"
"fmla v29.8h, v2.8h, v5.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 30f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 32f\n"
"29:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 32f\n"
"30:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 31f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
"fmla v28.8h, v3.8h, v5.8h\n"
- "add x19, x16, x12\n"
+ "add x20, x17, x13\n"
"fmla v29.8h, v3.8h, v6.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 34f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 36f\n"
"33:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 36f\n"
"34:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 35f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.8h, v3.8h, v13.8h\n"
"fmla v28.8h, v4.8h, v6.8h\n"
- "add x19, x16, x10\n"
+ "add x20, x17, x11\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 38f\n"
- "ldr d8, [x19], #0x8\n"
+ "ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
- "ld1 { v8.s }[2], [x19], #0x4\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v8.h }[6], [x19]\n"
+ "ld1 { v8.h }[6], [x20]\n"
"b 40f\n"
"37:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v8.h }[4], [x19]\n"
+ "ld1 { v8.h }[4], [x20]\n"
"b 40f\n"
"38:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 39f\n"
- "ldr s8, [x19], #0x4\n"
+ "ldr s8, [x20], #0x4\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v8.h }[2], [x19]\n"
+ "ld1 { v8.h }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h8, [x19, #0x0]\n"
+ "ldr h8, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v31.8h, v4.8h, v8.8h\n"
"fmla v28.8h, v0.8h, v14.8h\n"
- "add x19, x15, XZR\n"
+ "add x20, x16, XZR\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 42f\n"
- "ldr d5, [x19], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v5.h }[6], [x19]\n"
+ "ld1 { v5.h }[6], [x20]\n"
"b 44f\n"
"41:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v5.h }[4], [x19]\n"
+ "ld1 { v5.h }[4], [x20]\n"
"b 44f\n"
"42:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 43f\n"
- "ldr s5, [x19], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v5.h }[2], [x19]\n"
+ "ld1 { v5.h }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h5, [x19, #0x0]\n"
+ "ldr h5, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v30.8h, v0.8h, v5.8h\n"
- "add x19, x15, x3\n"
+ "add x20, x16, x2\n"
"tbz %x[n_channels], #2, 46f\n"
- "ldr d6, [x19], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v6.h }[6], [x19]\n"
+ "ld1 { v6.h }[6], [x20]\n"
"b 48f\n"
"45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v6.h }[4], [x19]\n"
+ "ld1 { v6.h }[4], [x20]\n"
"b 48f\n"
"46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 47f\n"
- "ldr s6, [x19], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v6.h }[2], [x19]\n"
+ "ld1 { v6.h }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h6, [x19, #0x0]\n"
+ "ldr h6, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.8h, v0.8h, v6.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
- "add x19, x15, x7\n"
+ "add x20, x16, x6\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v30.8h, v1.8h, v6.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 50f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 52f\n"
"49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 52f\n"
"50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 51f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"fmla v28.8h, v2.8h, v12.8h\n"
- "add x19, x15, x14\n"
+ "add x20, x16, x15\n"
"fmla v29.8h, v2.8h, v9.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 54f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 56f\n"
"53:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 56f\n"
"54:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 55f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v9.8h\n"
- "add x19, x15, x12\n"
+ "add x20, x16, x13\n"
"fmla v29.8h, v3.8h, v13.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 58f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 57f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 60f\n"
"57:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 60f\n"
"58:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 59f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 60f\n"
"59:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
"fmla v28.8h, v4.8h, v13.8h\n"
- "add x19, x15, x10\n"
+ "add x20, x16, x11\n"
"fmla v29.8h, v4.8h, v8.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 62f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v14.h }[6], [x19]\n"
+ "ld1 { v14.h }[6], [x20]\n"
"b 64f\n"
"61:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v14.h }[4], [x19]\n"
+ "ld1 { v14.h }[4], [x20]\n"
"b 64f\n"
"62:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 63f\n"
- "ldr s14, [x19], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v14.h }[2], [x19]\n"
+ "ld1 { v14.h }[2], [x20]\n"
"b 64f\n"
"63:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h14, [x19, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
"64:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v31.8h, v4.8h, v14.8h\n"
"fmla v28.8h, v0.8h, v5.8h\n"
- "add x19, x13, XZR\n"
+ "add x20, x14, XZR\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 66f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 68f\n"
"65:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 68f\n"
"66:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 67f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 68f\n"
"67:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
"fmla v30.8h, v0.8h, v9.8h\n"
- "add x19, x13, x3\n"
+ "add x20, x14, x2\n"
"tbz %x[n_channels], #2, 70f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v13.h }[6], [x19]\n"
+ "ld1 { v13.h }[6], [x20]\n"
"b 72f\n"
"69:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v13.h }[4], [x19]\n"
+ "ld1 { v13.h }[4], [x20]\n"
"b 72f\n"
"70:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 71f\n"
- "ldr s13, [x19], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
"tbz %x[n_channels], #0, 72f\n"
- "ld1 { v13.h }[2], [x19]\n"
+ "ld1 { v13.h }[2], [x20]\n"
"b 72f\n"
"71:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h13, [x19, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
"72:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.8h, v0.8h, v13.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
- "add x19, x13, x7\n"
+ "add x20, x14, x6\n"
"fmla v29.8h, v1.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v13.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 74f\n"
- "ldr d5, [x19], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #1, 73f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v5.h }[6], [x19]\n"
+ "ld1 { v5.h }[6], [x20]\n"
"b 76f\n"
"73:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v5.h }[4], [x19]\n"
+ "ld1 { v5.h }[4], [x20]\n"
"b 76f\n"
"74:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 75f\n"
- "ldr s5, [x19], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
"tbz %x[n_channels], #0, 76f\n"
- "ld1 { v5.h }[2], [x19]\n"
+ "ld1 { v5.h }[2], [x20]\n"
"b 76f\n"
"75:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h5, [x19, #0x0]\n"
+ "ldr h5, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.8h, v1.8h, v5.8h\n"
"fmla v28.8h, v2.8h, v10.8h\n"
- "add x19, x13, x14\n"
+ "add x20, x14, x15\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v5.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 78f\n"
- "ldr d6, [x19], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #1, 77f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v6.h }[6], [x19]\n"
+ "ld1 { v6.h }[6], [x20]\n"
"b 80f\n"
"77:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v6.h }[4], [x19]\n"
+ "ld1 { v6.h }[4], [x20]\n"
"b 80f\n"
"78:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 79f\n"
- "ldr s6, [x19], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
"tbz %x[n_channels], #0, 80f\n"
- "ld1 { v6.h }[2], [x19]\n"
+ "ld1 { v6.h }[2], [x20]\n"
"b 80f\n"
"79:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h6, [x19, #0x0]\n"
+ "ldr h6, [x20, #0x0]\n"
"80:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.8h, v2.8h, v6.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x19, x13, x12\n"
+ "add x20, x14, x13\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v6.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 82f\n"
- "ldr d8, [x19], #0x8\n"
+ "ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #1, 81f\n"
- "ld1 { v8.s }[2], [x19], #0x4\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v8.h }[6], [x19]\n"
+ "ld1 { v8.h }[6], [x20]\n"
"b 84f\n"
"81:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v8.h }[4], [x19]\n"
+ "ld1 { v8.h }[4], [x20]\n"
"b 84f\n"
"82:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 83f\n"
- "ldr s8, [x19], #0x4\n"
+ "ldr s8, [x20], #0x4\n"
"tbz %x[n_channels], #0, 84f\n"
- "ld1 { v8.h }[2], [x19]\n"
+ "ld1 { v8.h }[2], [x20]\n"
"b 84f\n"
"83:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h8, [x19, #0x0]\n"
+ "ldr h8, [x20, #0x0]\n"
"84:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.8h, v3.8h, v8.8h\n"
"fmla v28.8h, v4.8h, v12.8h\n"
- "add x19, x13, x10\n"
+ "add x20, x14, x11\n"
"fmla v29.8h, v4.8h, v14.8h\n"
"fmla v30.8h, v4.8h, v8.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 86f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 85f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v10.h }[6], [x19]\n"
+ "ld1 { v10.h }[6], [x20]\n"
"b 88f\n"
"85:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v10.h }[4], [x19]\n"
+ "ld1 { v10.h }[4], [x20]\n"
"b 88f\n"
"86:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 87f\n"
- "ldr s10, [x19], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
"tbz %x[n_channels], #0, 88f\n"
- "ld1 { v10.h }[2], [x19]\n"
+ "ld1 { v10.h }[2], [x20]\n"
"b 88f\n"
"87:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h10, [x19, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
"88:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"fmla v28.8h, v0.8h, v9.8h\n"
- "add x19, x11, XZR\n"
+ "add x20, x12, XZR\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 90f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 89f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 92f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 92f\n"
"89:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 92f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 92f\n"
"90:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 91f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 92f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 92f\n"
"91:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"92:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x19, x11, x3\n"
+ "add x20, x12, x2\n"
"tbz %x[n_channels], #2, 94f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 93f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 96f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 96f\n"
"93:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 96f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 96f\n"
"94:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 95f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 96f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 96f\n"
"95:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"96:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
"fmla v28.8h, v1.8h, v13.8h\n"
- "add x19, x11, x7\n"
+ "add x20, x12, x6\n"
"fmla v29.8h, v1.8h, v5.8h\n"
"fmla v30.8h, v1.8h, v12.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 98f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 97f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 100f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 100f\n"
"97:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 100f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 100f\n"
"98:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 99f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 100f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 100f\n"
"99:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"100:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v5.8h\n"
- "add x19, x11, x14\n"
+ "add x20, x12, x15\n"
"fmla v29.8h, v2.8h, v6.8h\n"
"fmla v30.8h, v2.8h, v9.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 102f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 101f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 104f\n"
- "ld1 { v11.h }[6], [x19]\n"
+ "ld1 { v11.h }[6], [x20]\n"
"b 104f\n"
"101:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 104f\n"
- "ld1 { v11.h }[4], [x19]\n"
+ "ld1 { v11.h }[4], [x20]\n"
"b 104f\n"
"102:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 103f\n"
- "ldr s11, [x19], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
"tbz %x[n_channels], #0, 104f\n"
- "ld1 { v11.h }[2], [x19]\n"
+ "ld1 { v11.h }[2], [x20]\n"
"b 104f\n"
"103:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
- "ldr h11, [x19, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
"104:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v6.8h\n"
- "add x19, x11, x12\n"
+ "add x20, x12, x13\n"
"fmla v29.8h, v3.8h, v8.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 106f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 105f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 108f\n"
- "ld1 { v12.h }[6], [x19]\n"
+ "ld1 { v12.h }[6], [x20]\n"
"b 108f\n"
"105:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 108f\n"
- "ld1 { v12.h }[4], [x19]\n"
+ "ld1 { v12.h }[4], [x20]\n"
"b 108f\n"
"106:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 107f\n"
- "ldr s12, [x19], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
"tbz %x[n_channels], #0, 108f\n"
- "ld1 { v12.h }[2], [x19]\n"
+ "ld1 { v12.h }[2], [x20]\n"
"b 108f\n"
"107:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
- "ldr h12, [x19, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
"108:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
"fmla v28.8h, v4.8h, v8.8h\n"
- "add x19, x11, x10\n"
+ "add x20, x12, x11\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"tbz %x[n_channels], #2, 110f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 109f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 112f\n"
- "ld1 { v9.h }[6], [x19]\n"
+ "ld1 { v9.h }[6], [x20]\n"
"b 112f\n"
"109:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 112f\n"
- "ld1 { v9.h }[4], [x19]\n"
+ "ld1 { v9.h }[4], [x20]\n"
"b 112f\n"
"110:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 111f\n"
- "ldr s9, [x19], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
"tbz %x[n_channels], #0, 112f\n"
- "ld1 { v9.h }[2], [x19]\n"
+ "ld1 { v9.h }[2], [x20]\n"
"b 112f\n"
"111:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
- "ldr h9, [x19, #0x0]\n"
+ "ldr h9, [x20, #0x0]\n"
"112:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
"fmla v31.8h, v4.8h, v9.8h\n"
"fmax v28.8h, v28.8h, v18.8h\n"
@@ -1303,82 +1303,82 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"tbz %x[n_channels], #2, 114f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.d }[0], [x20], x4\n"
- "add x6, x6, #0x8\n"
- "add x9, x9, #0x8\n"
- "st1 { v30.d }[0], [x19], x4\n"
- "st1 { v29.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 113f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.s }[2], [x20], x4\n"
- "add x6, x6, #0x4\n"
- "add x9, x9, #0x4\n"
- "st1 { v30.s }[2], [x19], x4\n"
- "st1 { v29.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 116f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.h }[6], [x20], x4\n"
- "st1 { v30.h }[6], [x19], x4\n"
- "st1 { v29.h }[6], [x20]\n"
- "st1 { v31.h }[6], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[6], [x21], x3\n"
+ "st1 { v30.h }[6], [x20], x3\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
"b 116f\n"
"113:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 116f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.h }[4], [x20], x4\n"
- "st1 { v30.h }[4], [x19], x4\n"
- "st1 { v29.h }[4], [x20]\n"
- "st1 { v31.h }[4], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[4], [x21], x3\n"
+ "st1 { v30.h }[4], [x20], x3\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
"b 116f\n"
"114:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 115f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.s }[0], [x20], x4\n"
- "st1 { v30.s }[0], [x19], x4\n"
- "add x6, x6, #0x4\n"
- "add x9, x9, #0x4\n"
- "st1 { v29.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 116f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.h }[2], [x20], x4\n"
- "st1 { v30.h }[2], [x19], x4\n"
- "st1 { v29.h }[2], [x20]\n"
- "st1 { v31.h }[2], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[2], [x21], x3\n"
+ "st1 { v30.h }[2], [x20], x3\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
"b 116f\n"
"115:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.h }[0], [x20], x4\n"
- "st1 { v30.h }[0], [x19], x4\n"
- "st1 { v29.h }[0], [x20]\n"
- "st1 { v31.h }[0], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[0], [x21], x3\n"
+ "st1 { v30.h }[0], [x20], x3\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
"116:" // Tile loop: Oddments: Store: Bit 2: End
"117:" // Tile loop: End
- "ldr x25, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x25, x25, #0x1\n"
- "add x20, x26, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x25, x19\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x26, x26, x20, LT\n"
- "csel x25, x25, XZR, LT\n"
- "cmp x26, x19\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 5b086ec1ff..a2791d277e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,422 +99,422 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "lsr x27, %x[n_channels], #0x3\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x3\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
- "ldp x15, x14, [x21, #0x0]\n"
- "ldp x13, x12, [x21, #0x10]\n"
- "add x11, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ld1r { v18.8h }, [x20]\n"
- "ld1r { v17.8h }, [x19]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
"mov x10, #0x0\n"
- "sub x9, XZR, x28\n"
- "cbz x27, 3f\n"
- "ldp x26, x25, [x11, #0x0]\n"
- "ldr q5, [x26, x10]\n"
- "ldr q6, [x25, x10]\n"
- "ldp x24, x23, [x11, #0x10]\n"
- "cmp x28, x27, LSL #4\n"
- "ldp x22, x21, [x11, #0x20]\n"
- "ldp x20, x19, [x11, #0x30]\n"
- "ldp x26, x25, [x11, #0x40]\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
"ldr q16, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "ldr q7, [x24, x10]\n"
"add x16, x16, #0x60\n"
- "ldr q8, [x23, x10]\n"
- "ldr q9, [x22, x10]\n"
- "ldr q13, [x21, x10]\n"
- "ldr q11, [x20, x10]\n"
- "ldr q12, [x19, x10]\n"
- "ldr q10, [x26, x10]\n"
- "ldr q14, [x25, x10]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldr q5, [x27, x10]\n"
+ "ldr q6, [x26, x10]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldr q7, [x25, x10]\n"
+ "ldr q8, [x24, x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldr q9, [x23, x10]\n"
+ "ldr q13, [x22, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x27, x26, [x15, #0x40]\n"
+ "ldr q10, [x27, x10]\n"
+ "ldr q14, [x26, x10]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x24, [x11, #0x50]\n"
- "ldr q5, [x24, x10]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr q5, [x25, x10]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr x23, [x11, #0x58]\n"
- "ldr x22, [x11, #0x60]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr q16, [x16, #0x140]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q6, [x23, x10]\n"
- "ldr x21, [x11, #0x68]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr q6, [x24, x10]\n"
"fmla v30.8h, v1.8h, v8.8h\n"
"fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x20, [x11, #0x70]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr x23, [x15, #0x60]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x23, x10]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q9, [x22, x10]\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr x22, [x15, #0x68]\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"fmla v31.8h, v2.8h, v5.8h\n"
- "ldr x19, [x11, #0x78]\n"
"ldr q2, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q11, [x21, x10]\n"
- "ldr x26, [x11, #0x80]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"fmla v31.8h, v3.8h, v6.8h\n"
"ldr q3, [x16, #0x30]\n"
- "ldr x25, [x11, #0x88]\n"
+ "ldr x27, [x15, #0x80]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q12, [x20, x10]\n"
- "ldr q9, [x19, x10]\n"
+ "ldr q9, [x20, x10]\n"
"fmla v30.8h, v4.8h, v6.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"ldr q4, [x16, #0x40]\n"
- "ldr x24, [x11, #0x90]\n"
+ "ldr x26, [x15, #0x88]\n"
"fmla v28.8h, v0.8h, v7.8h\n"
"fmla v29.8h, v0.8h, v8.8h\n"
- "ldr x23, [x11, #0x98]\n"
- "ldr x22, [x11, #0xa0]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "ldr x24, [x15, #0x98]\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"fmla v31.8h, v0.8h, v11.8h\n"
"ldr q0, [x16, #0x50]\n"
- "ldr x21, [x11, #0xa8]\n"
+ "ldr x23, [x15, #0xa0]\n"
"fmla v28.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x26, x10]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr q8, [x25, x10]\n"
- "ldr x20, [x11, #0xb0]\n"
+ "ldr x22, [x15, #0xa8]\n"
"fmla v30.8h, v1.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"ldr q1, [x16, #0x60]\n"
- "ldr x19, [x11, #0xb8]\n"
+ "ldr x21, [x15, #0xb0]\n"
"fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x27, x10]\n"
"fmla v29.8h, v2.8h, v5.8h\n"
- "ldr q13, [x26, x10]\n"
- "ldr x26, [x11, #0xc0]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v9.8h\n"
"ldr q2, [x16, #0x70]\n"
- "ldr x25, [x11, #0xc8]\n"
+ "ldr x27, [x15, #0xc0]\n"
"fmla v28.8h, v3.8h, v5.8h\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.8h, v3.8h, v6.8h\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0xd0]\n"
+ "ldr x26, [x15, #0xc8]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
"ldr q3, [x16, #0x80]\n"
- "add x9, x9, #0x10\n"
+ "ldr x25, [x15, #0xd0]\n"
"fmla v28.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q6, [x23, x10]\n"
- "ldr q10, [x22, x10]\n"
+ "ldr q10, [x23, x10]\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v8.8h\n"
"ldr q4, [x16, #0x90]\n"
- "ldr x23, [x11, #0xd8]\n"
+ "ldr x24, [x15, #0xd8]\n"
"fmla v28.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x20, x10]\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q14, [x19, x10]\n"
- "ldr x22, [x11, #0xe0]\n"
+ "ldr x23, [x15, #0xe0]\n"
"fmla v30.8h, v0.8h, v5.8h\n"
"fmla v31.8h, v0.8h, v6.8h\n"
"ldr q0, [x16, #0xa0]\n"
- "ldr x19, [x11, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q11, [x21, x10]\n"
- "ldr x21, [x11, #0xe8]\n"
+ "ldr x22, [x15, #0xe8]\n"
"fmla v30.8h, v1.8h, v6.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"ldr q1, [x16, #0xb0]\n"
- "ldr q16, [x16, #0x140]\n"
+ "add x28, x28, #0x10\n"
"fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.8h, v2.8h, v9.8h\n"
- "ldr q12, [x20, x10]\n"
- "ldr x20, [x11, #0xf0]\n"
+ "ldr x21, [x15, #0xf0]\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"ldr q2, [x16, #0xc0]\n"
"fmla v28.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q9, [x26, x10]\n"
- "ldr x26, [x11, #0x100]\n"
+ "ldr x27, [x15, #0x100]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v12.8h\n"
"ldr q3, [x16, #0xd0]\n"
"fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x26, x10]\n"
"fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q13, [x25, x10]\n"
- "ldr q8, [x22, x10]\n"
+ "ldr q8, [x23, x10]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v14.8h\n"
"ldr q4, [x16, #0xe0]\n"
- "ldr x25, [x11, #0x108]\n"
+ "ldr x26, [x15, #0x108]\n"
"fmla v28.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0x110]\n"
+ "ldr x25, [x15, #0x110]\n"
"fmla v30.8h, v0.8h, v9.8h\n"
"fmla v31.8h, v0.8h, v13.8h\n"
"ldr q0, [x16, #0xf0]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.8h, v1.8h, v10.8h\n"
- "ldr q6, [x23, x10]\n"
- "ldr x23, [x11, #0x118]\n"
+ "ldr x24, [x15, #0x118]\n"
"fmla v30.8h, v1.8h, v13.8h\n"
"fmla v31.8h, v1.8h, v5.8h\n"
"ldr q1, [x16, #0x100]\n"
"fmla v28.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x22, x10]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q10, [x21, x10]\n"
"fmla v30.8h, v2.8h, v5.8h\n"
"fmla v31.8h, v2.8h, v6.8h\n"
"ldr q2, [x16, #0x110]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x21, x10]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q11, [x20, x10]\n"
"fmla v30.8h, v3.8h, v6.8h\n"
"fmla v31.8h, v3.8h, v8.8h\n"
"ldr q3, [x16, #0x120]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x20, x10]\n"
"fmla v29.8h, v4.8h, v14.8h\n"
- "ldr q12, [x19, x10]\n"
"fmla v30.8h, v4.8h, v8.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"ldr q4, [x16, #0x130]\n"
"fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "ldr q9, [x26, x10]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x26, x10]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
- "ldr q11, [x25, x10]\n"
- "ldp x26, x25, [x11, #0x0]\n"
+ "ldr q0, [x16, #0x150]\n"
"fmla v28.8h, v1.8h, v13.8h\n"
"fmla v29.8h, v1.8h, v5.8h\n"
- "ldr q0, [x16, #0x150]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x25, x10]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
- "ldr q12, [x24, x10]\n"
"ldr q1, [x16, #0x160]\n"
"fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q5, [x27, x17]\n"
"fmla v29.8h, v2.8h, v6.8h\n"
- "ldr q5, [x26, x28]\n"
"fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x10]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q9, [x23, x10]\n"
- "ldp x24, x23, [x11, #0x10]\n"
+ "ldr q2, [x16, #0x170]\n"
"fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q6, [x26, x17]\n"
"fmla v29.8h, v3.8h, v8.8h\n"
- "ldr q6, [x25, x28]\n"
- "ldp x22, x21, [x11, #0x20]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldr q7, [x25, x17]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldp x20, x19, [x11, #0x30]\n"
- "ldp x26, x25, [x11, #0x40]\n"
+ "ldr q3, [x16, #0x180]\n"
"fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x24, x17]\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v18.8h\n"
- "ldr q7, [x24, x28]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldr q13, [x22, x17]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x23, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v18.8h\n"
- "ldr q8, [x23, x28]\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
"fmax v30.8h, v30.8h, v18.8h\n"
"fmax v31.8h, v31.8h, v18.8h\n"
- "ldr q9, [x22, x28]\n"
- "ldr q13, [x21, x28]\n"
- "ldr q11, [x20, x28]\n"
- "ldr q12, [x19, x28]\n"
+ "ldp x27, x26, [x15, #0x40]\n"
+ "ldr q10, [x27, x17]\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "ldr q10, [x26, x28]\n"
- "ldr q14, [x25, x28]\n"
- "add x28, x28, #0x10\n"
- "cmp x28, x27, LSL #4\n"
+ "ldr q14, [x26, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"add x10, x10, #0x10\n"
- "str q28, [x15, x9]\n"
- "str q29, [x14, x9]\n"
- "ldr q2, [x16, #0x170]\n"
- "ldr q3, [x16, #0x180]\n"
- "str q30, [x13, x9]\n"
- "ldr q4, [x16, #0x190]\n"
+ "str q28, [x14, x28]\n"
"add x16, x16, #0x1a0\n"
- "str q31, [x12, x9]\n"
+ "str q29, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q31, [x11, x28]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x24, [x11, #0x50]\n"
- "ldr q5, [x24, x10]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr q5, [x25, x10]\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
- "ldr x23, [x11, #0x58]\n"
- "ldr x22, [x11, #0x60]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x24, [x15, #0x58]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "ldr q6, [x23, x10]\n"
- "ldr x21, [x11, #0x68]\n"
+ "ldr x23, [x15, #0x60]\n"
"fmla v30.8h, v1.8h, v8.8h\n"
"fmla v31.8h, v1.8h, v13.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x20, [x11, #0x70]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr x22, [x15, #0x68]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x23, x10]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q9, [x22, x10]\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"fmla v31.8h, v2.8h, v5.8h\n"
- "ldr x19, [x11, #0x78]\n"
"ldr q2, [x16, #0x20]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q11, [x21, x10]\n"
- "ldr x26, [x11, #0x80]\n"
+ "ldr x27, [x15, #0x80]\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"fmla v31.8h, v3.8h, v6.8h\n"
"ldr q3, [x16, #0x30]\n"
- "ldr x25, [x11, #0x88]\n"
+ "ldr x26, [x15, #0x88]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
- "ldr q12, [x20, x10]\n"
- "ldr q9, [x19, x10]\n"
+ "ldr q9, [x20, x10]\n"
"fmla v30.8h, v4.8h, v6.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"ldr q4, [x16, #0x40]\n"
- "ldr x24, [x11, #0x90]\n"
+ "ldr x25, [x15, #0x90]\n"
"fmla v28.8h, v0.8h, v7.8h\n"
"fmla v29.8h, v0.8h, v8.8h\n"
- "ldr x23, [x11, #0x98]\n"
- "ldr x22, [x11, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "ldr x23, [x15, #0xa0]\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"fmla v31.8h, v0.8h, v11.8h\n"
"ldr q0, [x16, #0x50]\n"
- "ldr x21, [x11, #0xa8]\n"
+ "ldr x22, [x15, #0xa8]\n"
"fmla v28.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x26, x10]\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr q8, [x25, x10]\n"
- "ldr x20, [x11, #0xb0]\n"
+ "ldr x21, [x15, #0xb0]\n"
"fmla v30.8h, v1.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"ldr q1, [x16, #0x60]\n"
- "ldr x19, [x11, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x27, x10]\n"
"fmla v29.8h, v2.8h, v5.8h\n"
- "ldr q13, [x26, x10]\n"
- "ldr x26, [x11, #0xc0]\n"
+ "ldr x27, [x15, #0xc0]\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v9.8h\n"
"ldr q2, [x16, #0x70]\n"
- "ldr x25, [x11, #0xc8]\n"
+ "ldr x26, [x15, #0xc8]\n"
"fmla v28.8h, v3.8h, v5.8h\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.8h, v3.8h, v6.8h\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0xd0]\n"
+ "ldr x25, [x15, #0xd0]\n"
"fmla v30.8h, v3.8h, v9.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
"ldr q3, [x16, #0x80]\n"
- "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v28.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q6, [x23, x10]\n"
- "ldr q10, [x22, x10]\n"
+ "ldr q10, [x23, x10]\n"
"fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v8.8h\n"
"ldr q4, [x16, #0x90]\n"
- "ldr x23, [x11, #0xd8]\n"
+ "ldr x24, [x15, #0xd8]\n"
"fmla v28.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x20, x10]\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "ldr q14, [x19, x10]\n"
- "ldr x22, [x11, #0xe0]\n"
+ "ldr x23, [x15, #0xe0]\n"
"fmla v30.8h, v0.8h, v5.8h\n"
"fmla v31.8h, v0.8h, v6.8h\n"
"ldr q0, [x16, #0xa0]\n"
- "ldr x19, [x11, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "ldr q11, [x21, x10]\n"
- "ldr x21, [x11, #0xe8]\n"
+ "ldr x22, [x15, #0xe8]\n"
"fmla v30.8h, v1.8h, v6.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"ldr q1, [x16, #0xb0]\n"
"fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.8h, v2.8h, v9.8h\n"
- "ldr q12, [x20, x10]\n"
- "ldr x20, [x11, #0xf0]\n"
+ "ldr x21, [x15, #0xf0]\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"ldr q2, [x16, #0xc0]\n"
"fmla v28.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "ldr q9, [x26, x10]\n"
- "ldr x26, [x11, #0x100]\n"
+ "ldr x27, [x15, #0x100]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v12.8h\n"
"ldr q3, [x16, #0xd0]\n"
"fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x26, x10]\n"
"fmla v29.8h, v4.8h, v8.8h\n"
- "ldr q13, [x25, x10]\n"
- "ldr q8, [x22, x10]\n"
+ "ldr q8, [x23, x10]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"fmla v31.8h, v4.8h, v14.8h\n"
"ldr q4, [x16, #0xe0]\n"
- "ldr x25, [x11, #0x108]\n"
+ "ldr x26, [x15, #0x108]\n"
"fmla v28.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0x110]\n"
+ "ldr x25, [x15, #0x110]\n"
"fmla v30.8h, v0.8h, v9.8h\n"
"fmla v31.8h, v0.8h, v13.8h\n"
"ldr q0, [x16, #0xf0]\n"
"fmla v28.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.8h, v1.8h, v10.8h\n"
- "ldr q6, [x23, x10]\n"
- "ldr x23, [x11, #0x118]\n"
+ "ldr x24, [x15, #0x118]\n"
"fmla v30.8h, v1.8h, v13.8h\n"
"fmla v31.8h, v1.8h, v5.8h\n"
"ldr q1, [x16, #0x100]\n"
"fmla v28.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x22, x10]\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q10, [x21, x10]\n"
"fmla v30.8h, v2.8h, v5.8h\n"
"fmla v31.8h, v2.8h, v6.8h\n"
"ldr q2, [x16, #0x110]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x21, x10]\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "ldr q11, [x20, x10]\n"
"fmla v30.8h, v3.8h, v6.8h\n"
"fmla v31.8h, v3.8h, v8.8h\n"
"ldr q3, [x16, #0x120]\n"
"fmla v28.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x20, x10]\n"
"fmla v29.8h, v4.8h, v14.8h\n"
- "ldr q12, [x19, x10]\n"
"fmla v30.8h, v4.8h, v8.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"ldr q4, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
"fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "ldr q9, [x26, x10]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x26, x10]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
- "ldr q11, [x25, x10]\n"
"fmla v28.8h, v1.8h, v13.8h\n"
"fmla v29.8h, v1.8h, v5.8h\n"
"fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x25, x10]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
- "ldr q12, [x24, x10]\n"
"fmla v28.8h, v2.8h, v5.8h\n"
"fmla v29.8h, v2.8h, v6.8h\n"
"fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x10]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr q9, [x23, x10]\n"
"add x10, x10, #0x10\n"
"fmla v28.8h, v3.8h, v6.8h\n"
"fmla v29.8h, v3.8h, v8.8h\n"
@@ -530,134 +530,134 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmax v31.8h, v31.8h, v18.8h\n"
"fmin v28.8h, v28.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v17.8h\n"
- "str q28, [x15, x9]\n"
+ "str q28, [x14, x28]\n"
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
- "str q29, [x14, x9]\n"
- "str q30, [x13, x9]\n"
- "str q31, [x12, x9]\n"
+ "str q29, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q31, [x11, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 116f\n"
- "mov x9, x10\n"
- "ldr x28, [x11, #0x0]\n"
- "ldr x27, [x11, #0x8]\n"
- "ldr x26, [x11, #0x10]\n"
- "add x15, x15, x9\n"
- "add x14, x14, x9\n"
- "ldr x25, [x11, #0x18]\n"
- "ldr x24, [x11, #0x20]\n"
- "add x13, x13, x9\n"
- "add x12, x12, x9\n"
- "ldr x23, [x11, #0x28]\n"
- "ldr x22, [x11, #0x30]\n"
+ "ldr q16, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x28, x10\n"
+ "add x14, x14, x28\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x28\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
"add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
"add x27, x27, x10\n"
- "ldr x21, [x11, #0x38]\n"
- "ldr x20, [x11, #0x40]\n"
"add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
"add x25, x25, x10\n"
- "ldr x19, [x11, #0x48]\n"
- "ldr q16, [x16, #0x0]\n"
"add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
"add x23, x23, x10\n"
- "ldr q0, [x16, #0x10]\n"
- "ldr q1, [x16, #0x20]\n"
"add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
"add x21, x21, x10\n"
- "ldr q2, [x16, #0x30]\n"
- "ldr q3, [x16, #0x40]\n"
"add x20, x20, x10\n"
- "add x19, x19, x10\n"
- "ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v5.d }[0], [x28], #0x8\n"
- "ld1 { v6.d }[0], [x27], #0x8\n"
- "ld1 { v7.d }[0], [x26], #0x8\n"
- "ld1 { v8.d }[0], [x25], #0x8\n"
- "ld1 { v9.d }[0], [x24], #0x8\n"
- "ld1 { v13.d }[0], [x23], #0x8\n"
- "ld1 { v11.d }[0], [x22], #0x8\n"
- "ld1 { v12.d }[0], [x21], #0x8\n"
- "ld1 { v10.d }[0], [x20], #0x8\n"
- "ld1 { v14.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v5.s }[2], [x28], #0x4\n"
- "ld1 { v6.s }[2], [x27], #0x4\n"
- "ld1 { v7.s }[2], [x26], #0x4\n"
- "ld1 { v8.s }[2], [x25], #0x4\n"
- "ld1 { v9.s }[2], [x24], #0x4\n"
- "ld1 { v13.s }[2], [x23], #0x4\n"
- "ld1 { v11.s }[2], [x22], #0x4\n"
- "ld1 { v12.s }[2], [x21], #0x4\n"
- "ld1 { v10.s }[2], [x20], #0x4\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v5.h }[6], [x28], #0x2\n"
- "ld1 { v6.h }[6], [x27], #0x2\n"
- "ld1 { v7.h }[6], [x26], #0x2\n"
- "ld1 { v8.h }[6], [x25], #0x2\n"
- "ld1 { v9.h }[6], [x24], #0x2\n"
- "ld1 { v13.h }[6], [x23], #0x2\n"
- "ld1 { v11.h }[6], [x22], #0x2\n"
- "ld1 { v12.h }[6], [x21], #0x2\n"
- "ld1 { v10.h }[6], [x20], #0x2\n"
- "ld1 { v14.h }[6], [x19], #0x2\n"
+ "ld1 { v5.h }[6], [x9], #0x2\n"
+ "ld1 { v6.h }[6], [x28], #0x2\n"
+ "ld1 { v7.h }[6], [x27], #0x2\n"
+ "ld1 { v8.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x25], #0x2\n"
+ "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
"b 7f\n"
"4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v5.h }[4], [x28], #0x2\n"
- "ld1 { v6.h }[4], [x27], #0x2\n"
- "ld1 { v7.h }[4], [x26], #0x2\n"
- "ld1 { v8.h }[4], [x25], #0x2\n"
- "ld1 { v9.h }[4], [x24], #0x2\n"
- "ld1 { v13.h }[4], [x23], #0x2\n"
- "ld1 { v11.h }[4], [x22], #0x2\n"
- "ld1 { v12.h }[4], [x21], #0x2\n"
- "ld1 { v10.h }[4], [x20], #0x2\n"
- "ld1 { v14.h }[4], [x19], #0x2\n"
+ "ld1 { v5.h }[4], [x9], #0x2\n"
+ "ld1 { v6.h }[4], [x28], #0x2\n"
+ "ld1 { v7.h }[4], [x27], #0x2\n"
+ "ld1 { v8.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x25], #0x2\n"
+ "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
"b 7f\n"
"5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v5.s }[0], [x28], #0x4\n"
- "ld1 { v6.s }[0], [x27], #0x4\n"
- "ld1 { v7.s }[0], [x26], #0x4\n"
- "ld1 { v8.s }[0], [x25], #0x4\n"
- "ld1 { v9.s }[0], [x24], #0x4\n"
- "ld1 { v13.s }[0], [x23], #0x4\n"
- "ld1 { v11.s }[0], [x22], #0x4\n"
- "ld1 { v12.s }[0], [x21], #0x4\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "ld1 { v14.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v5.h }[2], [x28], #0x2\n"
- "ld1 { v6.h }[2], [x27], #0x2\n"
- "ld1 { v7.h }[2], [x26], #0x2\n"
- "ld1 { v8.h }[2], [x25], #0x2\n"
- "ld1 { v9.h }[2], [x24], #0x2\n"
- "ld1 { v13.h }[2], [x23], #0x2\n"
- "ld1 { v11.h }[2], [x22], #0x2\n"
- "ld1 { v12.h }[2], [x21], #0x2\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "ld1 { v14.h }[2], [x19], #0x2\n"
+ "ld1 { v5.h }[2], [x9], #0x2\n"
+ "ld1 { v6.h }[2], [x28], #0x2\n"
+ "ld1 { v7.h }[2], [x27], #0x2\n"
+ "ld1 { v8.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x25], #0x2\n"
+ "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"b 7f\n"
"6:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v5.h }[0], [x28], #0x2\n"
- "ld1 { v6.h }[0], [x27], #0x2\n"
- "ld1 { v7.h }[0], [x26], #0x2\n"
- "ld1 { v8.h }[0], [x25], #0x2\n"
- "ld1 { v9.h }[0], [x24], #0x2\n"
- "ld1 { v13.h }[0], [x23], #0x2\n"
- "ld1 { v11.h }[0], [x22], #0x2\n"
- "ld1 { v12.h }[0], [x21], #0x2\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "ld1 { v14.h }[0], [x19], #0x2\n"
+ "ld1 { v5.h }[0], [x9], #0x2\n"
+ "ld1 { v6.h }[0], [x28], #0x2\n"
+ "ld1 { v7.h }[0], [x27], #0x2\n"
+ "ld1 { v8.h }[0], [x26], #0x2\n"
+ "ld1 { v9.h }[0], [x25], #0x2\n"
+ "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x22], #0x2\n"
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
"mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v5.8h\n"
"mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "ldr x19, [x11, #0x50]\n"
- "add x19, x19, x10\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
"mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
@@ -668,701 +668,701 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"tbz %x[n_channels], #2, 9f\n"
- "ld1 { v5.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v5.h }[6], [x19], #0x2\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
"b 11f\n"
"8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v5.h }[4], [x19], #0x2\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
"b 11f\n"
"9:" // Oddments: Load input (1, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v5.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v5.h }[2], [x19], #0x2\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
"b 11f\n"
"10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v5.h }[0], [x19], #0x2\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x19, [x11, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v31.8h, v2.8h, v5.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v6.d }[0], [x19], #0x8\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v6.h }[6], [x19], #0x2\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
"b 15f\n"
"12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v6.h }[4], [x19], #0x2\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
"b 15f\n"
"13:" // Oddments: Load input (1, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v6.s }[0], [x19], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v6.h }[2], [x19], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"b 15f\n"
"14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v6.h }[0], [x19], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x19, [x11, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v31.8h, v3.8h, v6.8h\n"
"fmla v28.8h, v4.8h, v12.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.h }[6], [x19], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: Load input (0, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.h }[4], [x19], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: Load input (0, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.h }[2], [x19], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: Load input (0, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x19], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (0, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
"fmla v30.8h, v4.8h, v6.8h\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"fmla v28.8h, v0.8h, v7.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"fmla v29.8h, v0.8h, v8.8h\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 21f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v11.h }[6], [x19], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 23f\n"
"20:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v11.h }[4], [x19], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 23f\n"
"21:" // Oddments: Load input (2, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v11.h }[2], [x19], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 23f\n"
"22:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x19], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (2, 1): Bit 2: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v31.8h, v0.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v8.8h\n"
"fmla v29.8h, v1.8h, v13.8h\n"
"fmla v30.8h, v1.8h, v11.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 25f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v12.h }[6], [x19], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 27f\n"
"24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v12.h }[4], [x19], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 27f\n"
"25:" // Oddments: Load input (2, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 27f\n"
"26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 2): Bit 2: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v31.8h, v1.8h, v12.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
"fmla v29.8h, v2.8h, v5.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 29f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v9.h }[6], [x19], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 31f\n"
"28:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v9.h }[4], [x19], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 31f\n"
"29:" // Oddments: Load input (2, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v9.h }[2], [x19], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 31f\n"
"30:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x19], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (2, 3): Bit 2: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
"fmla v28.8h, v3.8h, v5.8h\n"
"fmla v29.8h, v3.8h, v6.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 33f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v13.h }[6], [x19], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 35f\n"
"32:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v13.h }[4], [x19], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 35f\n"
"33:" // Oddments: Load input (2, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v13.s }[0], [x19], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v13.h }[2], [x19], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 35f\n"
"34:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x19], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (2, 4): Bit 2: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v31.8h, v3.8h, v13.8h\n"
"fmla v28.8h, v4.8h, v6.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 37f\n"
- "ld1 { v8.d }[0], [x19], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v8.s }[2], [x19], #0x4\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v8.h }[6], [x19], #0x2\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
"b 39f\n"
"36:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v8.h }[4], [x19], #0x2\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
"b 39f\n"
"37:" // Oddments: Load input (2, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v8.s }[0], [x19], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v8.h }[2], [x19], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"b 39f\n"
"38:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v8.h }[0], [x19], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (2, 5): Bit 2: End
"ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v31.8h, v4.8h, v8.8h\n"
"fmla v28.8h, v0.8h, v14.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 41f\n"
- "ld1 { v5.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v5.h }[6], [x19], #0x2\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
"b 43f\n"
"40:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v5.h }[4], [x19], #0x2\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
"b 43f\n"
"41:" // Oddments: Load input (3, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v5.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v5.h }[2], [x19], #0x2\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
"b 43f\n"
"42:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v5.h }[0], [x19], #0x2\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x19, [x11, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.8h, v0.8h, v5.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #2, 45f\n"
- "ld1 { v6.d }[0], [x19], #0x8\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v6.h }[6], [x19], #0x2\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
"b 47f\n"
"44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v6.h }[4], [x19], #0x2\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
"b 47f\n"
"45:" // Oddments: Load input (3, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 46f\n"
- "ld1 { v6.s }[0], [x19], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v6.h }[2], [x19], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"b 47f\n"
"46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v6.h }[0], [x19], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 1): Bit 2: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v31.8h, v0.8h, v6.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v30.8h, v1.8h, v6.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 49f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v10.h }[6], [x19], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 51f\n"
"48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v10.h }[4], [x19], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 51f\n"
"49:" // Oddments: Load input (3, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v10.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v10.h }[2], [x19], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 51f\n"
"50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x19], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (3, 2): Bit 2: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v31.8h, v1.8h, v10.8h\n"
"fmla v28.8h, v2.8h, v12.8h\n"
"fmla v29.8h, v2.8h, v9.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 53f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v11.h }[6], [x19], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 55f\n"
"52:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v11.h }[4], [x19], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 55f\n"
"53:" // Oddments: Load input (3, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 54f\n"
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v11.h }[2], [x19], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 55f\n"
"54:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x19], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (3, 3): Bit 2: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v9.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 57f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v12.h }[6], [x19], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 59f\n"
"56:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v12.h }[4], [x19], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 59f\n"
"57:" // Oddments: Load input (3, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 58f\n"
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 59f\n"
"58:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (3, 4): Bit 2: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
"fmla v28.8h, v4.8h, v13.8h\n"
"fmla v29.8h, v4.8h, v8.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 61f\n"
- "ld1 { v14.d }[0], [x19], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v14.h }[6], [x19], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
"b 63f\n"
"60:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v14.h }[4], [x19], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
"b 63f\n"
"61:" // Oddments: Load input (3, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 62f\n"
- "ld1 { v14.s }[0], [x19], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v14.h }[2], [x19], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
"b 63f\n"
"62:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v14.h }[0], [x19], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (3, 5): Bit 2: End
"ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v31.8h, v4.8h, v14.8h\n"
"fmla v28.8h, v0.8h, v5.8h\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 65f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v9.h }[6], [x19], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 67f\n"
"64:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v9.h }[4], [x19], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 67f\n"
"65:" // Oddments: Load input (4, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 66f\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v9.h }[2], [x19], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 67f\n"
"66:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x19], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (4, 0): Bit 2: End
- "ldr x19, [x11, #0xc8]\n"
+ "ldr x20, [x15, #0xc8]\n"
"fmla v30.8h, v0.8h, v9.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #2, 69f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v13.h }[6], [x19], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
"b 71f\n"
"68:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v13.h }[4], [x19], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
"b 71f\n"
"69:" // Oddments: Load input (4, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 70f\n"
- "ld1 { v13.s }[0], [x19], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 71f\n"
- "ld1 { v13.h }[2], [x19], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
"b 71f\n"
"70:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v13.h }[0], [x19], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (4, 1): Bit 2: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0xd0]\n"
+ "ldr x20, [x15, #0xd0]\n"
"fmla v31.8h, v0.8h, v13.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v13.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 73f\n"
- "ld1 { v5.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v5.h }[6], [x19], #0x2\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
"b 75f\n"
"72:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v5.h }[4], [x19], #0x2\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
"b 75f\n"
"73:" // Oddments: Load input (4, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 74f\n"
- "ld1 { v5.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 75f\n"
- "ld1 { v5.h }[2], [x19], #0x2\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
"b 75f\n"
"74:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v5.h }[0], [x19], #0x2\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
"75:" // Oddments: Load input (4, 2): Bit 2: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0xd8]\n"
+ "ldr x20, [x15, #0xd8]\n"
"fmla v31.8h, v1.8h, v5.8h\n"
"fmla v28.8h, v2.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v5.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 77f\n"
- "ld1 { v6.d }[0], [x19], #0x8\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v6.h }[6], [x19], #0x2\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
"b 79f\n"
"76:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v6.h }[4], [x19], #0x2\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
"b 79f\n"
"77:" // Oddments: Load input (4, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 78f\n"
- "ld1 { v6.s }[0], [x19], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "ld1 { v6.h }[2], [x19], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
"b 79f\n"
"78:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v6.h }[0], [x19], #0x2\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
"79:" // Oddments: Load input (4, 3): Bit 2: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0xe0]\n"
+ "ldr x20, [x15, #0xe0]\n"
"fmla v31.8h, v2.8h, v6.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v6.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 81f\n"
- "ld1 { v8.d }[0], [x19], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 80f\n"
- "ld1 { v8.s }[2], [x19], #0x4\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v8.h }[6], [x19], #0x2\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
"b 83f\n"
"80:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v8.h }[4], [x19], #0x2\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
"b 83f\n"
"81:" // Oddments: Load input (4, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 82f\n"
- "ld1 { v8.s }[0], [x19], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 83f\n"
- "ld1 { v8.h }[2], [x19], #0x2\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
"b 83f\n"
"82:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v8.h }[0], [x19], #0x2\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
"83:" // Oddments: Load input (4, 4): Bit 2: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0xe8]\n"
+ "ldr x20, [x15, #0xe8]\n"
"fmla v31.8h, v3.8h, v8.8h\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v14.8h\n"
"fmla v30.8h, v4.8h, v8.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 85f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 84f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v10.h }[6], [x19], #0x2\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
"b 87f\n"
"84:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v10.h }[4], [x19], #0x2\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
"b 87f\n"
"85:" // Oddments: Load input (4, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 86f\n"
- "ld1 { v10.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 87f\n"
- "ld1 { v10.h }[2], [x19], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
"b 87f\n"
"86:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v10.h }[0], [x19], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
"87:" // Oddments: Load input (4, 5): Bit 2: End
"ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0xf0]\n"
+ "ldr x20, [x15, #0xf0]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"fmla v28.8h, v0.8h, v9.8h\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 89f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 88f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 91f\n"
- "ld1 { v11.h }[6], [x19], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 91f\n"
"88:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 91f\n"
- "ld1 { v11.h }[4], [x19], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 91f\n"
"89:" // Oddments: Load input (5, 0): Bit 2: Unset
"tbz %x[n_channels], #1, 90f\n"
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 91f\n"
- "ld1 { v11.h }[2], [x19], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 91f\n"
"90:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x19], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"91:" // Oddments: Load input (5, 0): Bit 2: End
- "ldr x19, [x11, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #2, 93f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 92f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 95f\n"
- "ld1 { v12.h }[6], [x19], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 95f\n"
"92:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 95f\n"
- "ld1 { v12.h }[4], [x19], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 95f\n"
"93:" // Oddments: Load input (5, 1): Bit 2: Unset
"tbz %x[n_channels], #1, 94f\n"
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 95f\n"
- "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 95f\n"
"94:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"95:" // Oddments: Load input (5, 1): Bit 2: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0x100]\n"
+ "ldr x20, [x15, #0x100]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
"fmla v28.8h, v1.8h, v13.8h\n"
"fmla v29.8h, v1.8h, v5.8h\n"
"fmla v30.8h, v1.8h, v12.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 97f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 96f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 99f\n"
- "ld1 { v9.h }[6], [x19], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 99f\n"
"96:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 99f\n"
- "ld1 { v9.h }[4], [x19], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 99f\n"
"97:" // Oddments: Load input (5, 2): Bit 2: Unset
"tbz %x[n_channels], #1, 98f\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 99f\n"
- "ld1 { v9.h }[2], [x19], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 99f\n"
"98:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x19], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"99:" // Oddments: Load input (5, 2): Bit 2: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0x108]\n"
+ "ldr x20, [x15, #0x108]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v5.8h\n"
"fmla v29.8h, v2.8h, v6.8h\n"
"fmla v30.8h, v2.8h, v9.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 101f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 100f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 103f\n"
- "ld1 { v11.h }[6], [x19], #0x2\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
"b 103f\n"
"100:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 103f\n"
- "ld1 { v11.h }[4], [x19], #0x2\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
"b 103f\n"
"101:" // Oddments: Load input (5, 3): Bit 2: Unset
"tbz %x[n_channels], #1, 102f\n"
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 103f\n"
- "ld1 { v11.h }[2], [x19], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"b 103f\n"
"102:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
- "ld1 { v11.h }[0], [x19], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"103:" // Oddments: Load input (5, 3): Bit 2: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0x110]\n"
+ "ldr x20, [x15, #0x110]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v6.8h\n"
"fmla v29.8h, v3.8h, v8.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 105f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 104f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 107f\n"
- "ld1 { v12.h }[6], [x19], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
"b 107f\n"
"104:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 107f\n"
- "ld1 { v12.h }[4], [x19], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
"b 107f\n"
"105:" // Oddments: Load input (5, 4): Bit 2: Unset
"tbz %x[n_channels], #1, 106f\n"
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 107f\n"
- "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
"b 107f\n"
"106:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
- "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
"107:" // Oddments: Load input (5, 4): Bit 2: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0x118]\n"
+ "ldr x20, [x15, #0x118]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
"fmla v28.8h, v4.8h, v8.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #2, 109f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 108f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 111f\n"
- "ld1 { v9.h }[6], [x19], #0x2\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
"b 111f\n"
"108:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 111f\n"
- "ld1 { v9.h }[4], [x19], #0x2\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
"b 111f\n"
"109:" // Oddments: Load input (5, 5): Bit 2: Unset
"tbz %x[n_channels], #1, 110f\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 111f\n"
- "ld1 { v9.h }[2], [x19], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
"b 111f\n"
"110:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
- "ld1 { v9.h }[0], [x19], #0x2\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
"111:" // Oddments: Load input (5, 5): Bit 2: End
"fmla v31.8h, v4.8h, v9.8h\n"
"fmax v28.8h, v28.8h, v18.8h\n"
@@ -1374,52 +1374,50 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmin v30.8h, v30.8h, v17.8h\n"
"fmin v31.8h, v31.8h, v17.8h\n"
"tbz %x[n_channels], #2, 113f\n"
- "st1 { v28.d }[0], [x15], #0x8\n"
- "st1 { v29.d }[0], [x14], #0x8\n"
- "st1 { v30.d }[0], [x13], #0x8\n"
- "st1 { v31.d }[0], [x12], #0x8\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
"tbz %x[n_channels], #1, 112f\n"
- "st1 { v28.s }[2], [x15], #0x4\n"
- "st1 { v29.s }[2], [x14], #0x4\n"
- "st1 { v30.s }[2], [x13], #0x4\n"
- "st1 { v31.s }[2], [x12], #0x4\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
"tbz %x[n_channels], #0, 115f\n"
- "st1 { v28.h }[6], [x15], #0x2\n"
- "st1 { v29.h }[6], [x14], #0x2\n"
- "st1 { v30.h }[6], [x13], #0x2\n"
- "st1 { v31.h }[6], [x12], #0x2\n"
+ "st1 { v28.h }[6], [x14], #0x2\n"
+ "st1 { v29.h }[6], [x13], #0x2\n"
+ "st1 { v30.h }[6], [x12], #0x2\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
"b 115f\n"
"112:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 115f\n"
- "st1 { v28.h }[4], [x15], #0x2\n"
- "st1 { v29.h }[4], [x14], #0x2\n"
- "st1 { v30.h }[4], [x13], #0x2\n"
- "st1 { v31.h }[4], [x12], #0x2\n"
+ "st1 { v28.h }[4], [x14], #0x2\n"
+ "st1 { v29.h }[4], [x13], #0x2\n"
+ "st1 { v30.h }[4], [x12], #0x2\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
"b 115f\n"
"113:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 114f\n"
- "st1 { v28.s }[0], [x15], #0x4\n"
- "st1 { v29.s }[0], [x14], #0x4\n"
- "st1 { v30.s }[0], [x13], #0x4\n"
- "st1 { v31.s }[0], [x12], #0x4\n"
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
"tbz %x[n_channels], #0, 115f\n"
- "st1 { v28.h }[2], [x15], #0x2\n"
- "st1 { v29.h }[2], [x14], #0x2\n"
- "st1 { v30.h }[2], [x13], #0x2\n"
- "st1 { v31.h }[2], [x12], #0x2\n"
+ "st1 { v28.h }[2], [x14], #0x2\n"
+ "st1 { v29.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
"b 115f\n"
"114:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "st1 { v28.h }[0], [x15], #0x2\n"
- "st1 { v29.h }[0], [x14], #0x2\n"
- "st1 { v30.h }[0], [x13], #0x2\n"
- "st1 { v31.h }[0], [x12], #0x2\n"
+ "st1 { v28.h }[0], [x14], #0x2\n"
+ "st1 { v29.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
"115:" // Oddments: Store: Bit 2: End
-
"116:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 423ee4190c..418530fdc4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,480 +44,475 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
const __fp16 minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v4.8h }, [%x[minmax_vals]]\n"
- "add x19, %x[minmax_vals], #0x2\n"
+ "ld1r { v2.8h }, [%x[minmax_vals]]\n"
+ "lsr x12, %x[n_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v1.8h }, [x20]\n"
"mov x11, #0x0\n"
- "ld1r { v3.8h }, [x19]\n"
- "lsr x10, %x[n_channels], #0x3\n"
- "cbz x10, 5f\n"
+ "cbz x12, 5f\n"
"1:" // Channel loop
- "movi v25.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q25, [%x[bias], x11]\n"
+ "ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
- "mov v24.16b, v25.16b\n"
- "ldr q23, [%x[params], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "mov v22.16b, v25.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, %x[n_points], #0x1\n"
- "mov v21.16b, v25.16b\n"
- "ldr q2, [x9, x11]\n"
- "mov v20.16b, v25.16b\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "ldr q14, [x10, x11]\n"
+ "ldr q15, [x9, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldr q16, [x28, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr q17, [x27, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr q18, [x26, x11]\n"
+ "ldr q19, [x25, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr q20, [x24, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "mov v19.16b, v25.16b\n"
- "ldr q1, [x28, x11]\n"
- "mov v18.16b, v25.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v17.16b, v25.16b\n"
- "ldr q0, [x27, x11]\n"
- "mov v16.16b, v25.16b\n"
- "ldr q31, [x26, x11]\n"
- "ldp x25, x24, [x20], #0x10\n"
- "ldr q30, [x25, x11]\n"
- "ldr q29, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "ldr q28, [x23, x11]\n"
- "ldr q27, [x22, x11]\n"
- "ldr x21, [x20], #0x8\n"
- "ldr q26, [x21, x11]\n"
+ "ldr q21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "ldr q22, [x22, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "fmla v25.8h, v2.8h, v23.8h\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, x19, #0x1\n"
- "fmla v24.8h, v1.8h, v23.8h\n"
- "ldr q2, [x9, x11]\n"
- "fmla v22.8h, v0.8h, v23.8h\n"
- "fmla v21.8h, v31.8h, v23.8h\n"
- "ldr q1, [x28, x11]\n"
- "fmla v20.8h, v30.8h, v23.8h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "fmla v19.8h, v29.8h, v23.8h\n"
- "fmla v18.8h, v28.8h, v23.8h\n"
- "ldr q0, [x27, x11]\n"
- "fmla v17.8h, v27.8h, v23.8h\n"
- "fmla v16.8h, v26.8h, v23.8h\n"
- "ldr q23, [%x[params], #0x0]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "ldr q14, [x10, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "ldr q15, [x9, x11]\n"
+ "ldr q16, [x28, x11]\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "ldr q17, [x27, x11]\n"
+ "ldr q18, [x26, x11]\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q19, [x25, x11]\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "ldr q20, [x24, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q31, [x26, x11]\n"
- "ldp x25, x24, [x20], #0x10\n"
- "ldr q30, [x25, x11]\n"
- "ldr q29, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "ldr q28, [x23, x11]\n"
- "ldr q27, [x22, x11]\n"
- "ldr x21, [x20], #0x8\n"
- "ldr q26, [x21, x11]\n"
+ "ldr q21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "ldr q22, [x22, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
- "fmla v25.8h, v2.8h, v23.8h\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
- "fmla v24.8h, v1.8h, v23.8h\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "fmla v22.8h, v0.8h, v23.8h\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
- "fmla v21.8h, v31.8h, v23.8h\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "fmla v20.8h, v30.8h, v23.8h\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmla v19.8h, v29.8h, v23.8h\n"
- "fmla v18.8h, v28.8h, v23.8h\n"
- "fmla v17.8h, v27.8h, v23.8h\n"
- "fmla v16.8h, v26.8h, v23.8h\n"
- "fmax v25.8h, v25.8h, v4.8h\n"
- "fmax v24.8h, v24.8h, v4.8h\n"
- "fmax v22.8h, v22.8h, v4.8h\n"
- "fmin v25.8h, v25.8h, v3.8h\n"
- "str q25, [x27, x11]\n"
- "fmin v24.8h, v24.8h, v3.8h\n"
- "fmin v22.8h, v22.8h, v3.8h\n"
- "str q24, [x26, x11]\n"
- "fmax v21.8h, v21.8h, v4.8h\n"
- "fmax v20.8h, v20.8h, v4.8h\n"
- "str q22, [x25, x11]\n"
- "fmax v19.8h, v19.8h, v4.8h\n"
- "fmax v18.8h, v18.8h, v4.8h\n"
- "fmin v21.8h, v21.8h, v3.8h\n"
- "str q21, [x24, x11]\n"
- "fmin v20.8h, v20.8h, v3.8h\n"
- "fmin v19.8h, v19.8h, v3.8h\n"
- "str q20, [x23, x11]\n"
- "fmin v18.8h, v18.8h, v3.8h\n"
- "fmax v17.8h, v17.8h, v4.8h\n"
- "str q19, [x22, x11]\n"
- "fmax v16.8h, v16.8h, v4.8h\n"
- "str q18, [x21, x11]\n"
- "fmin v17.8h, v17.8h, v3.8h\n"
- "fmin v16.8h, v16.8h, v3.8h\n"
- "str q17, [x20, x11]\n"
- "str q16, [x19, x11]\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "str q23, [x28, x11]\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "str q24, [x27, x11]\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "str q25, [x26, x11]\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "str q26, [x25, x11]\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "str q27, [x24, x11]\n"
+ "str q28, [x23, x11]\n"
+ "str q29, [x22, x11]\n"
+ "str q30, [x21, x11]\n"
+ "str q31, [x20, x11]\n"
"add x11, x11, #0x10\n"
- "cmp x11, x10, LSL #4\n"
+ "cmp x11, x12, LSL #4\n"
"blt 1b\n"
"5:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 25f\n"
- "movi v25.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
"cbz %x[bias], 10f\n"
- "add x19, %x[bias], x11\n"
+ "add x20, %x[bias], x11\n"
"tbz %x[n_channels], #2, 7f\n"
- "ld1 { v25.d }[0], [x19], #0x8\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v25.s }[2], [x19], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.h }[6], [x19], #0x2\n"
+ "ld1 { v23.h }[6], [x20], #0x2\n"
"b 9f\n"
"6:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.h }[4], [x19], #0x2\n"
+ "ld1 { v23.h }[4], [x20], #0x2\n"
"b 9f\n"
"7:" // Oddments: Load bias: Bit 2: Unset
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v25.s }[0], [x19], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.h }[2], [x19], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
"b 9f\n"
"8:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 9f\n"
- "ld1 { v25.h }[0], [x19], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
"9:" // Oddments: Load bias: Bit 2: End
-
"10:" // Oddments: Load bias: Done
- "mov v24.16b, v25.16b\n"
- "ldr q23, [%x[params], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "mov v22.16b, v25.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "add %x[params], %x[params], #0x10\n"
- "mov v21.16b, v25.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v20.16b, v25.16b\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr x22, [x21], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v19.16b, v25.16b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "mov v18.16b, v25.16b\n"
+ "mov v31.16b, v23.16b\n"
"add x28, x28, x11\n"
- "mov v17.16b, v25.16b\n"
- "ldp x23, x22, [x20], #0x10\n"
- "mov v16.16b, v25.16b\n"
"add x27, x27, x11\n"
- "ldr x21, [x20], #0x8\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 12f\n"
- "ldr d2, [x9], #0x8\n"
- "ldr d1, [x28], #0x8\n"
- "ldr d0, [x27], #0x8\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d30, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d14, [x10], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz %x[n_channels], #1, 11f\n"
- "ld1 { v2.s }[2], [x9], #0x4\n"
- "ld1 { v1.s }[2], [x28], #0x4\n"
- "ld1 { v0.s }[2], [x27], #0x4\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v30.s }[2], [x25], #0x4\n"
- "ld1 { v29.s }[2], [x24], #0x4\n"
- "ld1 { v28.s }[2], [x23], #0x4\n"
- "ld1 { v27.s }[2], [x22], #0x4\n"
- "ld1 { v26.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x10], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v2.h }[6], [x9], #0x2\n"
- "ld1 { v1.h }[6], [x28], #0x2\n"
- "ld1 { v0.h }[6], [x27], #0x2\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v30.h }[6], [x25], #0x2\n"
- "ld1 { v29.h }[6], [x24], #0x2\n"
- "ld1 { v28.h }[6], [x23], #0x2\n"
- "ld1 { v27.h }[6], [x22], #0x2\n"
- "ld1 { v26.h }[6], [x21], #0x2\n"
+ "ld1 { v14.h }[6], [x10], #0x2\n"
+ "ld1 { v15.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v17.h }[6], [x27], #0x2\n"
+ "ld1 { v18.h }[6], [x26], #0x2\n"
+ "ld1 { v19.h }[6], [x25], #0x2\n"
+ "ld1 { v20.h }[6], [x24], #0x2\n"
+ "ld1 { v21.h }[6], [x23], #0x2\n"
+ "ld1 { v22.h }[6], [x22], #0x2\n"
"b 14f\n"
"11:" // Oddments: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v2.h }[4], [x9], #0x2\n"
- "ld1 { v1.h }[4], [x28], #0x2\n"
- "ld1 { v0.h }[4], [x27], #0x2\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v30.h }[4], [x25], #0x2\n"
- "ld1 { v29.h }[4], [x24], #0x2\n"
- "ld1 { v28.h }[4], [x23], #0x2\n"
- "ld1 { v27.h }[4], [x22], #0x2\n"
- "ld1 { v26.h }[4], [x21], #0x2\n"
+ "ld1 { v14.h }[4], [x10], #0x2\n"
+ "ld1 { v15.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v17.h }[4], [x27], #0x2\n"
+ "ld1 { v18.h }[4], [x26], #0x2\n"
+ "ld1 { v19.h }[4], [x25], #0x2\n"
+ "ld1 { v20.h }[4], [x24], #0x2\n"
+ "ld1 { v21.h }[4], [x23], #0x2\n"
+ "ld1 { v22.h }[4], [x22], #0x2\n"
"b 14f\n"
"12:" // Oddments: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 13f\n"
- "ldr s2, [x9], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s0, [x27], #0x4\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s30, [x25], #0x4\n"
- "ldr s29, [x24], #0x4\n"
- "ldr s28, [x23], #0x4\n"
- "ldr s27, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s14, [x10], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v2.h }[2], [x9], #0x2\n"
- "ld1 { v1.h }[2], [x28], #0x2\n"
- "ld1 { v0.h }[2], [x27], #0x2\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v14.h }[2], [x10], #0x2\n"
+ "ld1 { v15.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v17.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v19.h }[2], [x25], #0x2\n"
+ "ld1 { v20.h }[2], [x24], #0x2\n"
+ "ld1 { v21.h }[2], [x23], #0x2\n"
+ "ld1 { v22.h }[2], [x22], #0x2\n"
"b 14f\n"
"13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 14f\n"
- "ldr h2, [x9], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h0, [x27], #0x2\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h30, [x25], #0x2\n"
- "ldr h29, [x24], #0x2\n"
- "ldr h28, [x23], #0x2\n"
- "ldr h27, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"14:" // Oddments: Load: Bit 2: End
- "subs x19, %x[n_points], #0x1\n"
+ "subs x20, %x[n_points], #0x1\n"
"ble 20f\n"
"15:" // Oddments: Planar loop
- "fmla v25.8h, v2.8h, v23.8h\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "ldr x22, [x21], #0x8\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "add x10, x10, x11\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
"add x9, x9, x11\n"
- "fmla v24.8h, v1.8h, v23.8h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "fmla v22.8h, v0.8h, v23.8h\n"
- "ldp x25, x24, [x20], #0x10\n"
- "fmla v21.8h, v31.8h, v23.8h\n"
"add x28, x28, x11\n"
- "fmla v20.8h, v30.8h, v23.8h\n"
- "ldp x23, x22, [x20], #0x10\n"
- "fmla v19.8h, v29.8h, v23.8h\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
"add x27, x27, x11\n"
- "fmla v18.8h, v28.8h, v23.8h\n"
- "ldr x21, [x20], #0x8\n"
- "fmla v17.8h, v27.8h, v23.8h\n"
"add x26, x26, x11\n"
- "fmla v16.8h, v26.8h, v23.8h\n"
- "ldr q23, [%x[params], #0x0]\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d2, [x9], #0x8\n"
- "ldr d1, [x28], #0x8\n"
- "ldr d0, [x27], #0x8\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d30, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d14, [x10], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v2.s }[2], [x9], #0x4\n"
- "ld1 { v1.s }[2], [x28], #0x4\n"
- "ld1 { v0.s }[2], [x27], #0x4\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v30.s }[2], [x25], #0x4\n"
- "ld1 { v29.s }[2], [x24], #0x4\n"
- "ld1 { v28.s }[2], [x23], #0x4\n"
- "ld1 { v27.s }[2], [x22], #0x4\n"
- "ld1 { v26.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x10], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v2.h }[6], [x9], #0x2\n"
- "ld1 { v1.h }[6], [x28], #0x2\n"
- "ld1 { v0.h }[6], [x27], #0x2\n"
- "ld1 { v31.h }[6], [x26], #0x2\n"
- "ld1 { v30.h }[6], [x25], #0x2\n"
- "ld1 { v29.h }[6], [x24], #0x2\n"
- "ld1 { v28.h }[6], [x23], #0x2\n"
- "ld1 { v27.h }[6], [x22], #0x2\n"
- "ld1 { v26.h }[6], [x21], #0x2\n"
+ "ld1 { v14.h }[6], [x10], #0x2\n"
+ "ld1 { v15.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v17.h }[6], [x27], #0x2\n"
+ "ld1 { v18.h }[6], [x26], #0x2\n"
+ "ld1 { v19.h }[6], [x25], #0x2\n"
+ "ld1 { v20.h }[6], [x24], #0x2\n"
+ "ld1 { v21.h }[6], [x23], #0x2\n"
+ "ld1 { v22.h }[6], [x22], #0x2\n"
"b 19f\n"
"16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v2.h }[4], [x9], #0x2\n"
- "ld1 { v1.h }[4], [x28], #0x2\n"
- "ld1 { v0.h }[4], [x27], #0x2\n"
- "ld1 { v31.h }[4], [x26], #0x2\n"
- "ld1 { v30.h }[4], [x25], #0x2\n"
- "ld1 { v29.h }[4], [x24], #0x2\n"
- "ld1 { v28.h }[4], [x23], #0x2\n"
- "ld1 { v27.h }[4], [x22], #0x2\n"
- "ld1 { v26.h }[4], [x21], #0x2\n"
+ "ld1 { v14.h }[4], [x10], #0x2\n"
+ "ld1 { v15.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v17.h }[4], [x27], #0x2\n"
+ "ld1 { v18.h }[4], [x26], #0x2\n"
+ "ld1 { v19.h }[4], [x25], #0x2\n"
+ "ld1 { v20.h }[4], [x24], #0x2\n"
+ "ld1 { v21.h }[4], [x23], #0x2\n"
+ "ld1 { v22.h }[4], [x22], #0x2\n"
"b 19f\n"
"17:" // Oddments: Planar loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s2, [x9], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s0, [x27], #0x4\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s30, [x25], #0x4\n"
- "ldr s29, [x24], #0x4\n"
- "ldr s28, [x23], #0x4\n"
- "ldr s27, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s14, [x10], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v2.h }[2], [x9], #0x2\n"
- "ld1 { v1.h }[2], [x28], #0x2\n"
- "ld1 { v0.h }[2], [x27], #0x2\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v14.h }[2], [x10], #0x2\n"
+ "ld1 { v15.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v17.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v19.h }[2], [x25], #0x2\n"
+ "ld1 { v20.h }[2], [x24], #0x2\n"
+ "ld1 { v21.h }[2], [x23], #0x2\n"
+ "ld1 { v22.h }[2], [x22], #0x2\n"
"b 19f\n"
"18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 19f\n"
- "ldr h2, [x9], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h0, [x27], #0x2\n"
- "ldr h31, [x26], #0x2\n"
- "ldr h30, [x25], #0x2\n"
- "ldr h29, [x24], #0x2\n"
- "ldr h28, [x23], #0x2\n"
- "ldr h27, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"19:" // Oddments: Planar loop: Load: Bit 2: End
- "subs x19, x19, #0x1\n"
+ "subs x20, x20, #0x1\n"
"bgt 15b\n"
"20:" // Oddments: Planar tail
- "fmla v25.8h, v2.8h, v23.8h\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
"add x27, x27, x11\n"
- "fmla v24.8h, v1.8h, v23.8h\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "fmla v22.8h, v0.8h, v23.8h\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
"add x26, x26, x11\n"
- "fmla v21.8h, v31.8h, v23.8h\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "fmla v20.8h, v30.8h, v23.8h\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
"add x25, x25, x11\n"
- "fmla v19.8h, v29.8h, v23.8h\n"
"add x24, x24, x11\n"
- "fmla v18.8h, v28.8h, v23.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
"add x23, x23, x11\n"
- "fmla v17.8h, v27.8h, v23.8h\n"
"add x22, x22, x11\n"
- "fmla v16.8h, v26.8h, v23.8h\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
"add x21, x21, x11\n"
- "fmax v25.8h, v25.8h, v4.8h\n"
"add x20, x20, x11\n"
- "fmax v24.8h, v24.8h, v4.8h\n"
- "add x19, x19, x11\n"
- "fmax v22.8h, v22.8h, v4.8h\n"
- "fmin v25.8h, v25.8h, v3.8h\n"
- "fmin v24.8h, v24.8h, v3.8h\n"
- "fmin v22.8h, v22.8h, v3.8h\n"
- "fmax v21.8h, v21.8h, v4.8h\n"
- "fmax v20.8h, v20.8h, v4.8h\n"
- "fmax v19.8h, v19.8h, v4.8h\n"
- "fmin v21.8h, v21.8h, v3.8h\n"
- "fmin v20.8h, v20.8h, v3.8h\n"
- "fmin v19.8h, v19.8h, v3.8h\n"
- "fmax v18.8h, v18.8h, v4.8h\n"
- "fmax v17.8h, v17.8h, v4.8h\n"
- "fmax v16.8h, v16.8h, v4.8h\n"
- "fmin v18.8h, v18.8h, v3.8h\n"
- "fmin v17.8h, v17.8h, v3.8h\n"
- "fmin v16.8h, v16.8h, v3.8h\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
"tbz %x[n_channels], #2, 22f\n"
- "st1 { v25.d }[0], [x27], #0x8\n"
- "st1 { v24.d }[0], [x26], #0x8\n"
- "st1 { v22.d }[0], [x25], #0x8\n"
- "st1 { v21.d }[0], [x24], #0x8\n"
- "st1 { v20.d }[0], [x23], #0x8\n"
- "st1 { v19.d }[0], [x22], #0x8\n"
- "st1 { v18.d }[0], [x21], #0x8\n"
- "st1 { v17.d }[0], [x20], #0x8\n"
- "st1 { v16.d }[0], [x19], #0x8\n"
+ "st1 { v23.d }[0], [x28], #0x8\n"
+ "st1 { v24.d }[0], [x27], #0x8\n"
+ "st1 { v25.d }[0], [x26], #0x8\n"
+ "st1 { v26.d }[0], [x25], #0x8\n"
+ "st1 { v27.d }[0], [x24], #0x8\n"
+ "st1 { v28.d }[0], [x23], #0x8\n"
+ "st1 { v29.d }[0], [x22], #0x8\n"
+ "st1 { v30.d }[0], [x21], #0x8\n"
+ "st1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
- "st1 { v25.s }[2], [x27], #0x4\n"
- "st1 { v24.s }[2], [x26], #0x4\n"
- "st1 { v22.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v20.s }[2], [x23], #0x4\n"
- "st1 { v19.s }[2], [x22], #0x4\n"
- "st1 { v18.s }[2], [x21], #0x4\n"
- "st1 { v17.s }[2], [x20], #0x4\n"
- "st1 { v16.s }[2], [x19], #0x4\n"
+ "st1 { v23.s }[2], [x28], #0x4\n"
+ "st1 { v24.s }[2], [x27], #0x4\n"
+ "st1 { v25.s }[2], [x26], #0x4\n"
+ "st1 { v26.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x24], #0x4\n"
+ "st1 { v28.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v25.h }[6], [x27], #0x2\n"
- "st1 { v24.h }[6], [x26], #0x2\n"
- "st1 { v22.h }[6], [x25], #0x2\n"
- "st1 { v21.h }[6], [x24], #0x2\n"
- "st1 { v20.h }[6], [x23], #0x2\n"
- "st1 { v19.h }[6], [x22], #0x2\n"
- "st1 { v18.h }[6], [x21], #0x2\n"
- "st1 { v17.h }[6], [x20], #0x2\n"
- "st1 { v16.h }[6], [x19], #0x2\n"
+ "st1 { v23.h }[6], [x28], #0x2\n"
+ "st1 { v24.h }[6], [x27], #0x2\n"
+ "st1 { v25.h }[6], [x26], #0x2\n"
+ "st1 { v26.h }[6], [x25], #0x2\n"
+ "st1 { v27.h }[6], [x24], #0x2\n"
+ "st1 { v28.h }[6], [x23], #0x2\n"
+ "st1 { v29.h }[6], [x22], #0x2\n"
+ "st1 { v30.h }[6], [x21], #0x2\n"
+ "st1 { v31.h }[6], [x20], #0x2\n"
"b 24f\n"
"21:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v25.h }[4], [x27], #0x2\n"
- "st1 { v24.h }[4], [x26], #0x2\n"
- "st1 { v22.h }[4], [x25], #0x2\n"
- "st1 { v21.h }[4], [x24], #0x2\n"
- "st1 { v20.h }[4], [x23], #0x2\n"
- "st1 { v19.h }[4], [x22], #0x2\n"
- "st1 { v18.h }[4], [x21], #0x2\n"
- "st1 { v17.h }[4], [x20], #0x2\n"
- "st1 { v16.h }[4], [x19], #0x2\n"
+ "st1 { v23.h }[4], [x28], #0x2\n"
+ "st1 { v24.h }[4], [x27], #0x2\n"
+ "st1 { v25.h }[4], [x26], #0x2\n"
+ "st1 { v26.h }[4], [x25], #0x2\n"
+ "st1 { v27.h }[4], [x24], #0x2\n"
+ "st1 { v28.h }[4], [x23], #0x2\n"
+ "st1 { v29.h }[4], [x22], #0x2\n"
+ "st1 { v30.h }[4], [x21], #0x2\n"
+ "st1 { v31.h }[4], [x20], #0x2\n"
"b 24f\n"
"22:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v25.s }[0], [x27], #0x4\n"
- "st1 { v24.s }[0], [x26], #0x4\n"
- "st1 { v22.s }[0], [x25], #0x4\n"
- "st1 { v21.s }[0], [x24], #0x4\n"
- "st1 { v20.s }[0], [x23], #0x4\n"
- "st1 { v19.s }[0], [x22], #0x4\n"
- "st1 { v18.s }[0], [x21], #0x4\n"
- "st1 { v17.s }[0], [x20], #0x4\n"
- "st1 { v16.s }[0], [x19], #0x4\n"
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v25.s }[0], [x26], #0x4\n"
+ "st1 { v26.s }[0], [x25], #0x4\n"
+ "st1 { v27.s }[0], [x24], #0x4\n"
+ "st1 { v28.s }[0], [x23], #0x4\n"
+ "st1 { v29.s }[0], [x22], #0x4\n"
+ "st1 { v30.s }[0], [x21], #0x4\n"
+ "st1 { v31.s }[0], [x20], #0x4\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v25.h }[2], [x27], #0x2\n"
- "st1 { v24.h }[2], [x26], #0x2\n"
- "st1 { v22.h }[2], [x25], #0x2\n"
- "st1 { v21.h }[2], [x24], #0x2\n"
- "st1 { v20.h }[2], [x23], #0x2\n"
- "st1 { v19.h }[2], [x22], #0x2\n"
- "st1 { v18.h }[2], [x21], #0x2\n"
- "st1 { v17.h }[2], [x20], #0x2\n"
- "st1 { v16.h }[2], [x19], #0x2\n"
+ "st1 { v23.h }[2], [x28], #0x2\n"
+ "st1 { v24.h }[2], [x27], #0x2\n"
+ "st1 { v25.h }[2], [x26], #0x2\n"
+ "st1 { v26.h }[2], [x25], #0x2\n"
+ "st1 { v27.h }[2], [x24], #0x2\n"
+ "st1 { v28.h }[2], [x23], #0x2\n"
+ "st1 { v29.h }[2], [x22], #0x2\n"
+ "st1 { v30.h }[2], [x21], #0x2\n"
+ "st1 { v31.h }[2], [x20], #0x2\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 24f\n"
- "st1 { v25.h }[0], [x27], #0x2\n"
- "st1 { v24.h }[0], [x26], #0x2\n"
- "st1 { v22.h }[0], [x25], #0x2\n"
- "st1 { v21.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x23], #0x2\n"
- "st1 { v19.h }[0], [x22], #0x2\n"
- "st1 { v18.h }[0], [x21], #0x2\n"
- "st1 { v17.h }[0], [x20], #0x2\n"
- "st1 { v16.h }[0], [x19], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
"24:" // Oddments: Store: Bit 2: End
"25:" // End
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index d9fc1403b2..f246cec87e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,1001 +45,998 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
__asm__ __volatile__(
"ld1r { v7.8h }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v6.8h }, [x20]\n"
"mov x10, #0x0\n"
- "add x19, %x[minmax_vals], #0x2\n"
- "ld1r { v6.8h }, [x19]\n"
- "lsr x9, %x[n_output_channels], #0x3\n"
- "cbz x9, 8f\n"
+ "cbz x11, 8f\n"
"1:" // Output channel loop
- "movi v16.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x10, #0x1\n"
- "ldr q16, [%x[bias], x19]\n"
+ "lsl x20, x10, #0x1\n"
+ "ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "mov v5.16b, v16.16b\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "mov x19, %x[inptrs]\n"
- "mov v31.16b, v16.16b\n"
- "ldp x25, x28, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "mov v30.16b, v16.16b\n"
- "ldr q3, [x25, #0x0]\n"
- "mov v29.16b, v16.16b\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
"add %x[weights], %x[weights], #0x10\n"
- "mov v28.16b, v16.16b\n"
- "ldr q2, [x28, #0x0]\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
- "mov v18.16b, v16.16b\n"
- "mov v17.16b, v16.16b\n"
- "cbz x20, 6f\n"
- "ldp x25, x28, [x19], #0x10\n"
- "ldr q16, [%x[weights], #0x0]\n"
- "subs x20, x20, #0x1\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x21, 6f\n"
+ "ldr q2, [%x[weights], #0x0]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q1, [x25, #0x0]\n"
- "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "ldp x25, x28, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "ldr q3, [x25, #0x0]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "ldr q2, [x28, #0x0]\n"
- "fmla v5.8h, v16.8h, v1.h[0]\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "fmla v31.8h, v16.8h, v1.h[1]\n"
- "ldp x25, x28, [x19], #0x10\n"
- "fmla v30.8h, v16.8h, v1.h[2]\n"
- "fmla v29.8h, v16.8h, v1.h[3]\n"
- "fmla v28.8h, v16.8h, v1.h[4]\n"
- "fmla v27.8h, v16.8h, v1.h[5]\n"
- "fmla v26.8h, v16.8h, v1.h[6]\n"
- "fmla v25.8h, v16.8h, v1.h[7]\n"
- "ldr q1, [x25, #0x0]\n"
- "fmla v24.8h, v16.8h, v0.h[0]\n"
- "fmla v23.8h, v16.8h, v0.h[1]\n"
- "fmla v22.8h, v16.8h, v0.h[2]\n"
- "fmla v21.8h, v16.8h, v0.h[3]\n"
- "fmla v20.8h, v16.8h, v0.h[4]\n"
- "fmla v19.8h, v16.8h, v0.h[5]\n"
- "fmla v18.8h, v16.8h, v0.h[6]\n"
- "fmla v17.8h, v16.8h, v0.h[7]\n"
- "ldr q0, [x28, #0x0]\n"
- "ldr q16, [%x[weights], #0x10]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "fmla v16.8h, v2.8h, v1.h[0]\n"
+ "fmla v17.8h, v2.8h, v1.h[1]\n"
+ "fmla v18.8h, v2.8h, v1.h[2]\n"
+ "fmla v19.8h, v2.8h, v1.h[3]\n"
+ "fmla v20.8h, v2.8h, v1.h[4]\n"
+ "fmla v21.8h, v2.8h, v1.h[5]\n"
+ "fmla v22.8h, v2.8h, v1.h[6]\n"
+ "fmla v23.8h, v2.8h, v1.h[7]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "fmla v24.8h, v2.8h, v0.h[0]\n"
+ "fmla v25.8h, v2.8h, v0.h[1]\n"
+ "fmla v26.8h, v2.8h, v0.h[2]\n"
+ "fmla v27.8h, v2.8h, v0.h[3]\n"
+ "fmla v28.8h, v2.8h, v0.h[4]\n"
+ "fmla v29.8h, v2.8h, v0.h[5]\n"
+ "fmla v30.8h, v2.8h, v0.h[6]\n"
+ "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q2, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "lsl x27, x10, #0x1\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "fmla v5.8h, v16.8h, v1.h[0]\n"
- "fmla v31.8h, v16.8h, v1.h[1]\n"
- "fmla v30.8h, v16.8h, v1.h[2]\n"
- "fmla v29.8h, v16.8h, v1.h[3]\n"
- "fmla v28.8h, v16.8h, v1.h[4]\n"
- "fmla v27.8h, v16.8h, v1.h[5]\n"
- "fmla v26.8h, v16.8h, v1.h[6]\n"
- "fmla v25.8h, v16.8h, v1.h[7]\n"
- "fmla v24.8h, v16.8h, v0.h[0]\n"
- "fmla v23.8h, v16.8h, v0.h[1]\n"
- "fmla v22.8h, v16.8h, v0.h[2]\n"
- "fmla v21.8h, v16.8h, v0.h[3]\n"
- "fmla v20.8h, v16.8h, v0.h[4]\n"
- "fmla v19.8h, v16.8h, v0.h[5]\n"
- "fmla v18.8h, v16.8h, v0.h[6]\n"
- "fmla v17.8h, v16.8h, v0.h[7]\n"
- "fmin v5.8h, v5.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmax v5.8h, v5.8h, v7.8h\n"
- "str q5, [x19, x27]\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q31, [x20, x27]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q30, [x21, x27]\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "str q29, [x22, x27]\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "str q28, [x23, x27]\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q27, [x24, x27]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "str q26, [x25, x27]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q25, [x26, x27]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "str q24, [x19, x27]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "str q23, [x20, x27]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "str q22, [x21, x27]\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "str q21, [x22, x27]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "str q20, [x23, x27]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "lsl x28, x10, #0x1\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v2.8h, v1.h[0]\n"
+ "fmla v17.8h, v2.8h, v1.h[1]\n"
+ "fmin v16.8h, v16.8h, v6.8h\n"
+ "fmla v18.8h, v2.8h, v1.h[2]\n"
+ "fmla v19.8h, v2.8h, v1.h[3]\n"
"fmin v17.8h, v17.8h, v6.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q19, [x24, x27]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmla v20.8h, v2.8h, v1.h[4]\n"
+ "fmla v21.8h, v2.8h, v1.h[5]\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "fmla v22.8h, v2.8h, v1.h[6]\n"
+ "fmla v23.8h, v2.8h, v1.h[7]\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmla v24.8h, v2.8h, v0.h[0]\n"
+ "fmla v25.8h, v2.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "fmla v26.8h, v2.8h, v0.h[2]\n"
+ "fmla v27.8h, v2.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "fmla v28.8h, v2.8h, v0.h[4]\n"
+ "fmla v29.8h, v2.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v0.h[6]\n"
+ "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "fmax v16.8h, v16.8h, v7.8h\n"
"fmax v17.8h, v17.8h, v7.8h\n"
- "str q18, [x25, x27]\n"
- "str q17, [x26, x27]\n"
+ "str q16, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "str q17, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "str q18, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "str q19, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "str q20, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "str q21, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "str q22, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "str q23, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "str q24, [x20, x28]\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "str q25, [x21, x28]\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "str q26, [x22, x28]\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "str q27, [x23, x28]\n"
+ "str q28, [x24, x28]\n"
+ "str q29, [x25, x28]\n"
+ "str q30, [x26, x28]\n"
+ "str q31, [x27, x28]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "ldp x25, x28, [x19], #0x10\n"
- "lsl x27, x10, #0x1\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "ldr q3, [x25, #0x0]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "ldr q2, [x28, #0x0]\n"
- "fmla v5.8h, v16.8h, v1.h[0]\n"
- "ldr q4, [%x[weights], #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "fmla v16.8h, v2.8h, v1.h[0]\n"
+ "fmla v17.8h, v2.8h, v1.h[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v31.8h, v16.8h, v1.h[1]\n"
- "fmla v30.8h, v16.8h, v1.h[2]\n"
- "fmla v29.8h, v16.8h, v1.h[3]\n"
- "fmla v28.8h, v16.8h, v1.h[4]\n"
- "fmla v27.8h, v16.8h, v1.h[5]\n"
- "fmla v26.8h, v16.8h, v1.h[6]\n"
- "fmla v25.8h, v16.8h, v1.h[7]\n"
- "fmla v24.8h, v16.8h, v0.h[0]\n"
- "fmla v23.8h, v16.8h, v0.h[1]\n"
- "fmla v22.8h, v16.8h, v0.h[2]\n"
- "fmla v21.8h, v16.8h, v0.h[3]\n"
- "fmla v20.8h, v16.8h, v0.h[4]\n"
- "fmla v19.8h, v16.8h, v0.h[5]\n"
- "fmla v18.8h, v16.8h, v0.h[6]\n"
- "fmla v17.8h, v16.8h, v0.h[7]\n"
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "fmin v5.8h, v5.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmax v5.8h, v5.8h, v7.8h\n"
- "str q5, [x19, x27]\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q31, [x20, x27]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q30, [x21, x27]\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "str q29, [x22, x27]\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "str q28, [x23, x27]\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q27, [x24, x27]\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "str q26, [x25, x27]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q25, [x26, x27]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "str q24, [x19, x27]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "str q23, [x20, x27]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "str q22, [x21, x27]\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "str q21, [x22, x27]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "str q20, [x23, x27]\n"
+ "fmla v18.8h, v2.8h, v1.h[2]\n"
+ "fmla v19.8h, v2.8h, v1.h[3]\n"
+ "fmla v20.8h, v2.8h, v1.h[4]\n"
+ "fmla v21.8h, v2.8h, v1.h[5]\n"
+ "fmla v22.8h, v2.8h, v1.h[6]\n"
+ "fmla v23.8h, v2.8h, v1.h[7]\n"
+ "fmla v24.8h, v2.8h, v0.h[0]\n"
+ "fmla v25.8h, v2.8h, v0.h[1]\n"
+ "fmla v26.8h, v2.8h, v0.h[2]\n"
+ "fmla v27.8h, v2.8h, v0.h[3]\n"
+ "fmla v28.8h, v2.8h, v0.h[4]\n"
+ "fmla v29.8h, v2.8h, v0.h[5]\n"
+ "fmla v30.8h, v2.8h, v0.h[6]\n"
+ "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmin v16.8h, v16.8h, v6.8h\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
"fmin v17.8h, v17.8h, v6.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q19, [x24, x27]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "fmax v16.8h, v16.8h, v7.8h\n"
"fmax v17.8h, v17.8h, v7.8h\n"
- "str q18, [x25, x27]\n"
- "str q17, [x26, x27]\n"
- "b 7f\n"
- "6:" // Output channel loop: Single kernel point
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "lsl x27, x10, #0x1\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "fmin v5.8h, v5.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmax v5.8h, v5.8h, v7.8h\n"
- "str q5, [x19, x27]\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "str q31, [x20, x27]\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "str q30, [x21, x27]\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "str q29, [x22, x27]\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "str q28, [x23, x27]\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "str q27, [x24, x27]\n"
+ "str q16, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "str q17, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "str q18, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "str q19, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"fmin v24.8h, v24.8h, v6.8h\n"
- "str q26, [x25, x27]\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "str q20, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "str q21, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "str q22, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "str q23, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
"fmax v24.8h, v24.8h, v7.8h\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "str q25, [x26, x27]\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "str q24, [x19, x27]\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "str q23, [x20, x27]\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "str q22, [x21, x27]\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "str q21, [x22, x27]\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
- "str q20, [x23, x27]\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "str q24, [x20, x28]\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "str q25, [x21, x28]\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "str q26, [x22, x28]\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "str q27, [x23, x28]\n"
+ "str q28, [x24, x28]\n"
+ "str q29, [x25, x28]\n"
+ "str q30, [x26, x28]\n"
+ "str q31, [x27, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmin v16.8h, v16.8h, v6.8h\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
"fmin v17.8h, v17.8h, v6.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "str q19, [x24, x27]\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmax v16.8h, v16.8h, v7.8h\n"
"fmax v17.8h, v17.8h, v7.8h\n"
- "str q18, [x25, x27]\n"
- "str q17, [x26, x27]\n"
+ "str q16, [x20, x28]\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "str q17, [x21, x28]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "str q18, [x22, x28]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "str q19, [x23, x28]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "str q20, [x24, x28]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "str q21, [x25, x28]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "str q22, [x26, x28]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "str q23, [x27, x28]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "str q24, [x20, x28]\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "str q25, [x21, x28]\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "str q26, [x22, x28]\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "str q27, [x23, x28]\n"
+ "str q28, [x24, x28]\n"
+ "str q29, [x25, x28]\n"
+ "str q30, [x26, x28]\n"
+ "str q31, [x27, x28]\n"
"7:" // Output channel loop: Done
"add x10, x10, #0x8\n"
- "cmp x10, x9, LSL #3\n"
+ "cmp x10, x11, LSL #3\n"
"blt 1b\n"
"tst %x[n_output_channels], #0x7\n"
"beq 23f\n"
"8:" // Output channel oddments
- "movi v16.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"cbz %x[bias], 13f\n"
- "add x19, %x[bias], x10, LSL #1\n"
+ "add x20, %x[bias], x10, LSL #1\n"
"tbz %x[n_output_channels], #2, 10f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #1, 9f\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v31.s }[2], [x20], #0x4\n"
"tbz %x[n_output_channels], #0, 12f\n"
- "ld1 { v16.h }[6], [x19]\n"
+ "ld1 { v31.h }[6], [x20]\n"
"b 12f\n"
"9:" // Output channel oddments: Load bias: Bit 2: Bit 1: Unset
"tbz %x[n_output_channels], #0, 12f\n"
- "ld1 { v16.h }[4], [x19]\n"
+ "ld1 { v31.h }[4], [x20]\n"
"b 12f\n"
"10:" // Output channel oddments: Load bias: Bit 2: Unset
"tbz %x[n_output_channels], #1, 11f\n"
- "ld1 { v16.s }[0], [x19], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz %x[n_output_channels], #0, 12f\n"
- "ld1 { v16.h }[2], [x19]\n"
+ "ld1 { v31.h }[2], [x20]\n"
"b 12f\n"
"11:" // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 12f\n"
- "ld1 { v16.h }[0], [x19]\n"
+ "ld1 { v31.h }[0], [x20]\n"
"12:" // Output channel oddments: Load bias: Bit 2: End
-
"13:" // Output channel oddments: Load bias: Done
- "mov v5.16b, v16.16b\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "mov x19, %x[inptrs]\n"
- "mov v31.16b, v16.16b\n"
- "ldp x25, x28, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "mov v30.16b, v16.16b\n"
- "ldr q3, [x25, #0x0]\n"
- "mov v29.16b, v16.16b\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
"add %x[weights], %x[weights], #0x10\n"
- "mov v28.16b, v16.16b\n"
- "ldr q2, [x28, #0x0]\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
- "mov v18.16b, v16.16b\n"
- "mov v17.16b, v16.16b\n"
- "cbz x20, 17f\n"
- "ldp x25, x28, [x19], #0x10\n"
- "ldr q16, [%x[weights], #0x0]\n"
- "subs x20, x20, #0x1\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x21, 17f\n"
+ "ldr q2, [%x[weights], #0x0]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q1, [x25, #0x0]\n"
- "ldr q0, [x28, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
"beq 15f\n"
"14:" // Output channel oddments: Kernel loop
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "ldp x25, x28, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "ldr q3, [x25, #0x0]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "ldr q2, [x28, #0x0]\n"
- "fmla v5.8h, v16.8h, v1.h[0]\n"
- "ldr q4, [%x[weights], #0x0]\n"
- "fmla v31.8h, v16.8h, v1.h[1]\n"
- "ldp x25, x28, [x19], #0x10\n"
- "fmla v30.8h, v16.8h, v1.h[2]\n"
- "fmla v29.8h, v16.8h, v1.h[3]\n"
- "fmla v28.8h, v16.8h, v1.h[4]\n"
- "fmla v27.8h, v16.8h, v1.h[5]\n"
- "fmla v26.8h, v16.8h, v1.h[6]\n"
- "fmla v25.8h, v16.8h, v1.h[7]\n"
- "ldr q1, [x25, #0x0]\n"
- "fmla v24.8h, v16.8h, v0.h[0]\n"
- "fmla v23.8h, v16.8h, v0.h[1]\n"
- "fmla v22.8h, v16.8h, v0.h[2]\n"
- "fmla v21.8h, v16.8h, v0.h[3]\n"
- "fmla v20.8h, v16.8h, v0.h[4]\n"
- "fmla v19.8h, v16.8h, v0.h[5]\n"
- "fmla v18.8h, v16.8h, v0.h[6]\n"
- "fmla v17.8h, v16.8h, v0.h[7]\n"
- "ldr q0, [x28, #0x0]\n"
- "ldr q16, [%x[weights], #0x10]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "fmla v16.8h, v2.8h, v1.h[0]\n"
+ "fmla v17.8h, v2.8h, v1.h[1]\n"
+ "fmla v18.8h, v2.8h, v1.h[2]\n"
+ "fmla v19.8h, v2.8h, v1.h[3]\n"
+ "fmla v20.8h, v2.8h, v1.h[4]\n"
+ "fmla v21.8h, v2.8h, v1.h[5]\n"
+ "fmla v22.8h, v2.8h, v1.h[6]\n"
+ "fmla v23.8h, v2.8h, v1.h[7]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "fmla v24.8h, v2.8h, v0.h[0]\n"
+ "fmla v25.8h, v2.8h, v0.h[1]\n"
+ "fmla v26.8h, v2.8h, v0.h[2]\n"
+ "fmla v27.8h, v2.8h, v0.h[3]\n"
+ "fmla v28.8h, v2.8h, v0.h[4]\n"
+ "fmla v29.8h, v2.8h, v0.h[5]\n"
+ "fmla v30.8h, v2.8h, v0.h[6]\n"
+ "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q2, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 14b\n"
"15:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 16f\n"
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "fmla v5.8h, v16.8h, v1.h[0]\n"
- "fmla v31.8h, v16.8h, v1.h[1]\n"
- "fmla v30.8h, v16.8h, v1.h[2]\n"
- "fmla v29.8h, v16.8h, v1.h[3]\n"
- "fmla v28.8h, v16.8h, v1.h[4]\n"
- "fmla v27.8h, v16.8h, v1.h[5]\n"
- "fmla v26.8h, v16.8h, v1.h[6]\n"
- "fmla v25.8h, v16.8h, v1.h[7]\n"
- "fmla v24.8h, v16.8h, v0.h[0]\n"
- "fmla v23.8h, v16.8h, v0.h[1]\n"
- "fmla v22.8h, v16.8h, v0.h[2]\n"
- "fmla v21.8h, v16.8h, v0.h[3]\n"
- "fmla v20.8h, v16.8h, v0.h[4]\n"
- "fmla v19.8h, v16.8h, v0.h[5]\n"
- "fmla v18.8h, v16.8h, v0.h[6]\n"
- "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v2.8h, v1.h[0]\n"
+ "fmla v17.8h, v2.8h, v1.h[1]\n"
+ "fmla v18.8h, v2.8h, v1.h[2]\n"
+ "fmla v19.8h, v2.8h, v1.h[3]\n"
+ "fmla v20.8h, v2.8h, v1.h[4]\n"
+ "fmla v21.8h, v2.8h, v1.h[5]\n"
+ "fmla v22.8h, v2.8h, v1.h[6]\n"
+ "fmla v23.8h, v2.8h, v1.h[7]\n"
+ "fmla v24.8h, v2.8h, v0.h[0]\n"
+ "fmla v25.8h, v2.8h, v0.h[1]\n"
+ "fmla v26.8h, v2.8h, v0.h[2]\n"
+ "fmla v27.8h, v2.8h, v0.h[3]\n"
+ "fmla v28.8h, v2.8h, v0.h[4]\n"
+ "fmla v29.8h, v2.8h, v0.h[5]\n"
+ "fmla v30.8h, v2.8h, v0.h[6]\n"
+ "fmla v31.8h, v2.8h, v0.h[7]\n"
"b 18f\n"
"16:" // Output channel oddments: Odd tail
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "ldp x25, x28, [x19], #0x10\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "ldr q3, [x25, #0x0]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
- "ldr q2, [x28, #0x0]\n"
- "fmla v5.8h, v16.8h, v1.h[0]\n"
- "ldr q4, [%x[weights], #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "ldp x24, x9, [x20], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x9, #0x0]\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "fmla v16.8h, v2.8h, v1.h[0]\n"
+ "fmla v17.8h, v2.8h, v1.h[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v31.8h, v16.8h, v1.h[1]\n"
- "fmla v30.8h, v16.8h, v1.h[2]\n"
- "fmla v29.8h, v16.8h, v1.h[3]\n"
- "fmla v28.8h, v16.8h, v1.h[4]\n"
- "fmla v27.8h, v16.8h, v1.h[5]\n"
- "fmla v26.8h, v16.8h, v1.h[6]\n"
- "fmla v25.8h, v16.8h, v1.h[7]\n"
- "fmla v24.8h, v16.8h, v0.h[0]\n"
- "fmla v23.8h, v16.8h, v0.h[1]\n"
- "fmla v22.8h, v16.8h, v0.h[2]\n"
- "fmla v21.8h, v16.8h, v0.h[3]\n"
- "fmla v20.8h, v16.8h, v0.h[4]\n"
- "fmla v19.8h, v16.8h, v0.h[5]\n"
- "fmla v18.8h, v16.8h, v0.h[6]\n"
- "fmla v17.8h, v16.8h, v0.h[7]\n"
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "fmla v18.8h, v2.8h, v1.h[2]\n"
+ "fmla v19.8h, v2.8h, v1.h[3]\n"
+ "fmla v20.8h, v2.8h, v1.h[4]\n"
+ "fmla v21.8h, v2.8h, v1.h[5]\n"
+ "fmla v22.8h, v2.8h, v1.h[6]\n"
+ "fmla v23.8h, v2.8h, v1.h[7]\n"
+ "fmla v24.8h, v2.8h, v0.h[0]\n"
+ "fmla v25.8h, v2.8h, v0.h[1]\n"
+ "fmla v26.8h, v2.8h, v0.h[2]\n"
+ "fmla v27.8h, v2.8h, v0.h[3]\n"
+ "fmla v28.8h, v2.8h, v0.h[4]\n"
+ "fmla v29.8h, v2.8h, v0.h[5]\n"
+ "fmla v30.8h, v2.8h, v0.h[6]\n"
+ "fmla v31.8h, v2.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
"b 18f\n"
"17:" // Output channel oddments: Single kernel point
- "fmla v5.8h, v4.8h, v3.h[0]\n"
- "fmla v31.8h, v4.8h, v3.h[1]\n"
- "fmla v30.8h, v4.8h, v3.h[2]\n"
- "fmla v29.8h, v4.8h, v3.h[3]\n"
- "fmla v28.8h, v4.8h, v3.h[4]\n"
- "fmla v27.8h, v4.8h, v3.h[5]\n"
- "fmla v26.8h, v4.8h, v3.h[6]\n"
- "fmla v25.8h, v4.8h, v3.h[7]\n"
- "fmla v24.8h, v4.8h, v2.h[0]\n"
- "fmla v23.8h, v4.8h, v2.h[1]\n"
- "fmla v22.8h, v4.8h, v2.h[2]\n"
- "fmla v21.8h, v4.8h, v2.h[3]\n"
- "fmla v20.8h, v4.8h, v2.h[4]\n"
- "fmla v19.8h, v4.8h, v2.h[5]\n"
- "fmla v18.8h, v4.8h, v2.h[6]\n"
- "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
"18:" // Output channel oddments: Done
- "fmin v5.8h, v5.8h, v6.8h\n"
- "fmin v31.8h, v31.8h, v6.8h\n"
- "fmin v30.8h, v30.8h, v6.8h\n"
- "fmin v29.8h, v29.8h, v6.8h\n"
- "fmax v5.8h, v5.8h, v7.8h\n"
- "fmax v31.8h, v31.8h, v7.8h\n"
- "fmax v30.8h, v30.8h, v7.8h\n"
- "fmax v29.8h, v29.8h, v7.8h\n"
- "fmin v28.8h, v28.8h, v6.8h\n"
- "fmin v27.8h, v27.8h, v6.8h\n"
- "fmin v26.8h, v26.8h, v6.8h\n"
- "fmax v28.8h, v28.8h, v7.8h\n"
- "fmax v27.8h, v27.8h, v7.8h\n"
- "fmax v26.8h, v26.8h, v7.8h\n"
- "fmin v25.8h, v25.8h, v6.8h\n"
- "fmin v24.8h, v24.8h, v6.8h\n"
- "fmin v23.8h, v23.8h, v6.8h\n"
- "fmax v25.8h, v25.8h, v7.8h\n"
- "fmax v24.8h, v24.8h, v7.8h\n"
- "fmax v23.8h, v23.8h, v7.8h\n"
- "fmin v22.8h, v22.8h, v6.8h\n"
- "fmin v21.8h, v21.8h, v6.8h\n"
- "fmin v20.8h, v20.8h, v6.8h\n"
- "fmax v22.8h, v22.8h, v7.8h\n"
- "fmax v21.8h, v21.8h, v7.8h\n"
- "fmax v20.8h, v20.8h, v7.8h\n"
- "fmin v19.8h, v19.8h, v6.8h\n"
- "fmin v18.8h, v18.8h, v6.8h\n"
+ "fmin v16.8h, v16.8h, v6.8h\n"
"fmin v17.8h, v17.8h, v6.8h\n"
- "fmax v19.8h, v19.8h, v7.8h\n"
- "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "fmax v16.8h, v16.8h, v7.8h\n"
"fmax v17.8h, v17.8h, v7.8h\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
"tbz %x[n_output_channels], #2, 20f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.d }[0], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.d }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.d }[0], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.d }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.d }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.d }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.d }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.d }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.d }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.d }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.d }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.d }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.d }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.d }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.d }[0], [x25]\n"
+ "st1 { v23.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
"add x10, x10, #0x4\n"
- "st1 { v17.d }[0], [x26]\n"
+ "st1 { v24.d }[0], [x20]\n"
+ "st1 { v25.d }[0], [x21]\n"
+ "st1 { v26.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x23]\n"
+ "st1 { v28.d }[0], [x24]\n"
+ "st1 { v29.d }[0], [x25]\n"
+ "st1 { v30.d }[0], [x26]\n"
+ "st1 { v31.d }[0], [x27]\n"
"tbz %x[n_output_channels], #1, 19f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.s }[2], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.s }[2], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.s }[2], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.s }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.s }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.s }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.s }[2], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.s }[2], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.s }[2], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.s }[2], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.s }[2], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.s }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.s }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.s }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.s }[2], [x25]\n"
+ "st1 { v23.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
"add x10, x10, #0x2\n"
- "st1 { v17.s }[2], [x26]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "st1 { v30.s }[2], [x26]\n"
+ "st1 { v31.s }[2], [x27]\n"
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.h }[6], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.h }[6], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.h }[6], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.h }[6], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.h }[6], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.h }[6], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.h }[6], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.h }[6], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.h }[6], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.h }[6], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[6], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.h }[6], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[6], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.h }[6], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.h }[6], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[6], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.h }[6], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.h }[6], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[6], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.h }[6], [x25]\n"
- "st1 { v17.h }[6], [x26]\n"
+ "st1 { v23.h }[6], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x20]\n"
+ "st1 { v25.h }[6], [x21]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x23]\n"
+ "st1 { v28.h }[6], [x24]\n"
+ "st1 { v29.h }[6], [x25]\n"
+ "st1 { v30.h }[6], [x26]\n"
+ "st1 { v31.h }[6], [x27]\n"
"b 22f\n"
"19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.h }[4], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.h }[4], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.h }[4], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.h }[4], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.h }[4], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.h }[4], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.h }[4], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.h }[4], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.h }[4], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.h }[4], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[4], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.h }[4], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[4], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.h }[4], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.h }[4], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[4], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.h }[4], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.h }[4], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[4], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.h }[4], [x25]\n"
- "st1 { v17.h }[4], [x26]\n"
+ "st1 { v23.h }[4], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x20]\n"
+ "st1 { v25.h }[4], [x21]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x23]\n"
+ "st1 { v28.h }[4], [x24]\n"
+ "st1 { v29.h }[4], [x25]\n"
+ "st1 { v30.h }[4], [x26]\n"
+ "st1 { v31.h }[4], [x27]\n"
"b 22f\n"
"20:" // Output channel oddments: Done: Store: Bit 2: Unset
"tbz %x[n_output_channels], #1, 21f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.s }[0], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.s }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.s }[0], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.s }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.s }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.s }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.s }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.s }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.s }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.s }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.s }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.s }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.s }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.s }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.s }[0], [x25]\n"
+ "st1 { v23.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
"add x10, x10, #0x2\n"
- "st1 { v17.s }[0], [x26]\n"
+ "st1 { v24.s }[0], [x20]\n"
+ "st1 { v25.s }[0], [x21]\n"
+ "st1 { v26.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x23]\n"
+ "st1 { v28.s }[0], [x24]\n"
+ "st1 { v29.s }[0], [x25]\n"
+ "st1 { v30.s }[0], [x26]\n"
+ "st1 { v31.s }[0], [x27]\n"
"tbz %x[n_output_channels], #0, 22f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.h }[2], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.h }[2], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.h }[2], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.h }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.h }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.h }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.h }[2], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.h }[2], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.h }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.h }[2], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.h }[2], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.h }[2], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.h }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.h }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.h }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.h }[2], [x25]\n"
- "st1 { v17.h }[2], [x26]\n"
+ "st1 { v23.h }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x20]\n"
+ "st1 { v25.h }[2], [x21]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x23]\n"
+ "st1 { v28.h }[2], [x24]\n"
+ "st1 { v29.h }[2], [x25]\n"
+ "st1 { v30.h }[2], [x26]\n"
+ "st1 { v31.h }[2], [x27]\n"
"b 22f\n"
"21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 22f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #1\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v5.h }[0], [x19]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v31.h }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v30.h }[0], [x21]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v29.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v28.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v27.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v26.h }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #1\n"
- "st1 { v25.h }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v16.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #1\n"
- "st1 { v24.h }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #1\n"
- "st1 { v23.h }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #1\n"
- "st1 { v22.h }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #1\n"
- "st1 { v21.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #1\n"
- "st1 { v20.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #1\n"
- "st1 { v19.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #1\n"
- "st1 { v18.h }[0], [x25]\n"
- "st1 { v17.h }[0], [x26]\n"
+ "st1 { v23.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x20]\n"
+ "st1 { v25.h }[0], [x21]\n"
+ "st1 { v26.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x23]\n"
+ "st1 { v28.h }[0], [x24]\n"
+ "st1 { v29.h }[0], [x25]\n"
+ "st1 { v30.h }[0], [x26]\n"
+ "st1 { v31.h }[0], [x27]\n"
"22:" // Output channel oddments: Done: Store: Bit 2: End
"23:" // Done
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 73c1e07420..2ff03aa15a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,438 +87,438 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x17, #0x0\n"
- "mov x16, #0x0\n"
- "1:" // Tile loop
- "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x25, #0x2\n"
- "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x15, #0x2\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x24, %x[params_struct], %[offsetof_args_min]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "add x21, %x[params_struct], %[offsetof_args_max]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x23, #0x0\n"
"mov x22, #0x0\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
- "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
- "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x12, x12, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1r { v18.4s }, [x24]\n"
- "add x9, x12, x23, LSL #2\n"
- "ld1r { v17.4s }, [x21]\n"
- "add x28, x9, x23, LSL #2\n"
- "lsl x13, x13, #0x2\n"
- "add x27, x28, x23, LSL #2\n"
- "add x26, x13, x13\n"
- "add x25, x26, x13\n"
- "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
- "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x15\n" // offset *= output_tile_size
- "add x10, x10, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x24, x10, x20, LSL #2\n"
- "lsl x11, x11, #0x2\n"
- "mov x21, #0x10\n" // cntb _, ALL, #1
- "sub x20, XZR, x21\n"
- "lsr x19, %x[n_channels], #0x2\n"
- "cbz x19, 4f\n"
- "ldr q16, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "cmp x21, x19, LSL #4\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
- "ldr q9, [x9, x13]\n"
- "ld1 { v10.4s }, [x12]\n"
- "ldr q11, [x12, x25]\n"
- "ldr q12, [x9, x26]\n"
- "ldr q13, [x28, x13]\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x2\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x9, x13, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x27, x28, x25, LSL #2\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #2\n"
+ "lsl x14, x14, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q16, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x27]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x11]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x27, x26]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
"add x20, x20, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
- "add x22, x22, #0x10\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
"add x21, x21, #0x10\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "ld1 { v9.4s }, [x27]\n"
- "cmp x21, x19, LSL #4\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x26]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "ldr q11, [x27, x25]\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
- "ldr q16, [x14, #0x0]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "ldr q12, [x12, x13]\n"
- "fmla v29.4s, v6.4s, v9.4s\n"
- "ldr q9, [x12, x26]\n"
- "add x12, x12, #0x10\n"
- "fmla v28.4s, v3.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x13, x15]\n"
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x13, x11]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
"ld1 { v11.4s }, [x9]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x25]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "ldr q4, [x14, #0x50]\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
"ld1 { v9.4s }, [x28]\n"
- "ldr q1, [x14, #0x20]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q0, [x14, #0x10]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr q2, [x14, #0x30]\n"
- "fmla v31.4s, v8.4s, v10.4s\n"
- "fmla v30.4s, v7.4s, v10.4s\n"
- "ldr q10, [x28, x25]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x28, x26]\n"
"add x28, x28, #0x10\n"
- "fmla v29.4s, v3.4s, v9.4s\n"
- "ldr q13, [x28, x13]\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x26]\n"
- "add x27, x27, #0x10\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q5, [x14, #0x60]\n"
- "fmla v31.4s, v6.4s, v9.4s\n"
- "ldr q9, [x9, x13]\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ld1 { v10.4s }, [x12]\n"
- "fmla v29.4s, v7.4s, v11.4s\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
- "ldr q11, [x12, x25]\n"
- "ldr q6, [x14, #0x70]\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "ldr q8, [x14, #0x90]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "ldr q7, [x14, #0x80]\n"
- "add x14, x14, #0xa0\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "st1 { v31.4s }, [x10]\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
"fmax v28.4s, v28.4s, v18.4s\n"
- "str q30, [x10, x11]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v29.4s }, [x24]\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "add x27, x27, #0x10\n"
"fmin v28.4s, v28.4s, v17.4s\n"
- "add x10, x10, #0x10\n"
- "str q28, [x24, x11]\n"
- "add x24, x24, #0x10\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "st1 { v28.4s }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q29, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v30.4s }, [x25]\n"
+ "str q31, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
"ld1 { v9.4s }, [x27]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x26]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "ldr q11, [x27, x25]\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "ldr q12, [x12, x13]\n"
- "fmla v29.4s, v6.4s, v9.4s\n"
- "ldr q9, [x12, x26]\n"
- "add x12, x12, #0x10\n"
- "fmla v28.4s, v3.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ld1 { v11.4s }, [x9]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x11]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x27, x26]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x25]\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x13, x15]\n"
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x13, x11]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x9]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x9, x26]\n"
"add x9, x9, #0x10\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
"ld1 { v9.4s }, [x28]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v8.4s, v10.4s\n"
- "fmla v30.4s, v7.4s, v10.4s\n"
- "ldr q10, [x28, x25]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x28, x26]\n"
"add x28, x28, #0x10\n"
- "fmla v29.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x27, x13]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x27, x26]\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x27, x11]\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
"add x27, x27, #0x10\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v6.4s, v9.4s\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "fmla v29.4s, v7.4s, v11.4s\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "st1 { v31.4s }, [x10]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q30, [x10, x11]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "add x10, x10, #0x10\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
"fmax v28.4s, v28.4s, v18.4s\n"
- "st1 { v29.4s }, [x24]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
"fmin v28.4s, v28.4s, v17.4s\n"
- "str q28, [x24, x11]\n"
- "add x24, x24, #0x10\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "st1 { v28.4s }, [x12]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q29, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v30.4s }, [x25]\n"
+ "str q31, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 31f\n"
- "ldr q16, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "add x23, x9, x13\n"
- "ldr q1, [x14, #0x20]\n"
- "add x22, x12, XZR\n"
- "ldr q2, [x14, #0x30]\n"
- "add x21, x12, x25\n"
- "ldr q3, [x14, #0x40]\n"
- "add x20, x9, x26\n"
- "ldr q4, [x14, #0x50]\n"
- "add x19, x28, x13\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
+ "ldr q16, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
"tbz %x[n_channels], #1, 5f\n"
- "ldr d9, [x23], #0x8\n"
- "ldr d10, [x22], #0x8\n"
- "ldr d11, [x21], #0x8\n"
- "ldr d12, [x20], #0x8\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v9.s }[2], [x23]\n"
- "ld1 { v10.s }[2], [x22]\n"
- "ld1 { v11.s }[2], [x21]\n"
- "ld1 { v12.s }[2], [x20]\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 6f\n"
"5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
- "ldr s9, [x23, #0x0]\n"
- "ldr s10, [x22, #0x0]\n"
- "ldr s11, [x21, #0x0]\n"
- "ldr s12, [x20, #0x0]\n"
- "ldr s13, [x19, #0x0]\n"
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
- "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
- "add x19, x27, XZR\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
"tbz %x[n_channels], #1, 7f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
- "fmla v29.4s, v6.4s, v9.4s\n"
- "add x19, x27, x25\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v28.4s, v3.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x27, x26\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 10f\n"
"9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
- "fmla v28.4s, v8.4s, v11.4s\n"
- "add x19, x12, x13\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x13, x15\n"
"tbz %x[n_channels], #1, 11f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
- "fmla v31.4s, v1.4s, v12.4s\n"
- "add x19, x12, x26\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x13, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 14f\n"
"13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
- "fmla v31.4s, v2.4s, v9.4s\n"
- "add x19, x28, x26\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x28, x11\n"
"tbz %x[n_channels], #1, 15f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
- "fmla v31.4s, v8.4s, v10.4s\n"
- "add x19, x9, XZR\n"
- "fmla v30.4s, v7.4s, v10.4s\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
"tbz %x[n_channels], #1, 17f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 18f\n"
"17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
- "fmla v31.4s, v3.4s, v11.4s\n"
- "add x19, x9, x25\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x9, x26\n"
"tbz %x[n_channels], #1, 19f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
- "fmla v30.4s, v5.4s, v12.4s\n"
- "add x19, x28, XZR\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x28, XZR\n"
"tbz %x[n_channels], #1, 21f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 22f\n"
"21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v9.4s\n"
- "add x19, x28, x25\n"
- "fmla v29.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x28, x26\n"
"tbz %x[n_channels], #1, 23f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
- "fmla v30.4s, v8.4s, v10.4s\n"
- "add x19, x27, x13\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x20, x27, x15\n"
"tbz %x[n_channels], #1, 25f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 26f\n"
"25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
- "fmla v29.4s, v7.4s, v11.4s\n"
- "add x19, x27, x26\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x20, x27, x11\n"
"tbz %x[n_channels], #1, 27f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
"fmax v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
"fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
"tbz %x[n_channels], #1, 29f\n"
- "mov x19, x10\n"
- "st1 { v31.d }[0], [x19], x11\n"
- "add x10, x10, #0x8\n"
- "st1 { v30.d }[0], [x19]\n"
- "mov x19, x24\n"
- "st1 { v29.d }[0], [x19], x11\n"
- "add x24, x24, #0x8\n"
- "st1 { v28.d }[0], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 30f\n"
- "mov x20, x10\n"
- "st1 { v31.s }[2], [x20], x11\n"
- "mov x19, x24\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v29.s }[2], [x19], x11\n"
- "st1 { v28.s }[2], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 30f\n"
"29:" // Tile loop: Oddments: Store: Bit 1: Unset
- "mov x20, x10\n"
- "st1 { v31.s }[0], [x20], x11\n"
- "mov x19, x24\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v29.s }[0], [x19], x11\n"
- "st1 { v28.s }[0], [x19]\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"30:" // Tile loop: Oddments: Store: Bit 1: End
"31:" // Tile loop: End
- "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x17, #0x1\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "add x16, x16, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x16, x19\n"
- "csel x16, x16, XZR, LT\n"
- "csel x17, x17, x21, LT\n"
- "cmp x17, x20\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 42931fba17..56e9ed2e1b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,283 +79,283 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
+ "lsr x15, %x[n_channels], #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.4s }, [x20]\n"
- "ld1r { v17.4s }, [x19]\n"
- "mov x14, #0x0\n"
- "ldp x13, x12, [x21, #0x0]\n"
- "mov x11, #0x10\n" // cntb _, ALL, #1
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
- "sub x28, XZR, x11\n"
- "lsr x27, %x[n_channels], #0x2\n"
- "cbz x27, 3f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x11, x27, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x26, x25, [x16, #0x0]\n"
- "ldp x24, x23, [x16, #0x10]\n"
- "ldr x22, [x16, #0x20]\n"
- "ldr q9, [x26, x14]\n"
- "ldr q10, [x25, x14]\n"
- "ldr q11, [x24, x14]\n"
- "ldr q12, [x23, x14]\n"
- "ldr q13, [x22, x14]\n"
+ "mov x28, #0x0\n"
+ "sub x27, XZR, x16\n"
+ "cbz x15, 3f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x16, x15, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldp x26, x22, [x13, #0x0]\n"
+ "ldr q9, [x26, x28]\n"
+ "ldr q10, [x22, x28]\n"
+ "ldp x25, x24, [x13, #0x10]\n"
+ "ldr q11, [x25, x28]\n"
+ "ldr q12, [x24, x28]\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr q13, [x23, x28]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
- "ldr x21, [x16, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
- "ldr x20, [x16, #0x30]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "ldr x19, [x16, #0x38]\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x21, x14]\n"
- "ldr x26, [x16, #0x40]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr x25, [x16, #0x48]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
- "ldr q11, [x20, x14]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "ldr q10, [x25, x14]\n"
- "ldr x24, [x16, #0x50]\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
- "ldr x23, [x16, #0x58]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v6.4s, v9.4s\n"
- "ldr q12, [x19, x14]\n"
- "fmla v28.4s, v3.4s, v13.4s\n"
- "ldr q9, [x26, x14]\n"
- "ldr x22, [x16, #0x60]\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
- "ldr x20, [x16, #0x70]\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "ldr x19, [x16, #0x78]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr x22, [x13, #0x28]\n"
+ "ldr x21, [x13, #0x30]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x22, x28]\n"
+ "ldr q16, [x14, #0x0]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "ldp x26, x25, [x16, #0x0]\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "ldr q12, [x23, x14]\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "ldp x24, x23, [x16, #0x10]\n"
- "ldr q16, [x15, #0x0]\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q9, [x22, x14]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "ldr x22, [x16, #0x20]\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v31.4s, v8.4s, v10.4s\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v30.4s, v7.4s, v10.4s\n"
- "ldr q10, [x21, x14]\n"
- "fmla v29.4s, v3.4s, v9.4s\n"
- "ldr q13, [x22, x11]\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x20, x14]\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
- "ldr q12, [x19, x14]\n"
- "add x14, x14, #0x10\n"
- "fmla v31.4s, v6.4s, v9.4s\n"
- "ldr q9, [x26, x11]\n"
- "fmla v29.4s, v7.4s, v11.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr q10, [x25, x11]\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
- "ldr q11, [x24, x11]\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "ldr q6, [x15, #0x70]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "ldr q8, [x15, #0x90]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "ldr q12, [x23, x11]\n"
- "add x11, x11, #0x10\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "ldr q7, [x15, #0x80]\n"
- "cmp x11, x27, LSL #4\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "str q31, [x13, x28]\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "add x15, x15, #0xa0\n"
+ "ldr x22, [x13, #0x48]\n"
+ "ldr q10, [x22, x28]\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x26, [x13, #0x40]\n"
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x26, x28]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "ldr x25, [x13, #0x50]\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "ldr x24, [x13, #0x58]\n"
+ "ldr x23, [x13, #0x60]\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x28]\n"
+ "ldr x22, [x13, #0x68]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x24, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr q9, [x23, x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x22, x28]\n"
+ "ldp x26, x22, [x13, #0x0]\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "ldp x25, x24, [x13, #0x10]\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr q13, [x23, x16]\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x21, x28]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x25, x16]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "ldr q9, [x26, x16]\n"
+ "ldr q10, [x22, x16]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x24, x16]\n"
+ "ldr q6, [x14, #0x70]\n"
"fmax v28.4s, v28.4s, v18.4s\n"
- "str q30, [x12, x28]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "str q29, [x10, x28]\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "add x16, x16, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmin v28.4s, v28.4s, v17.4s\n"
- "str q28, [x9, x28]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "cmp x16, x15, LSL #4\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "add x28, x28, #0x10\n"
+ "str q28, [x12, x27]\n"
+ "add x14, x14, #0xa0\n"
+ "str q29, [x11, x27]\n"
+ "str q30, [x10, x27]\n"
+ "str q31, [x9, x27]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
- "ldr x21, [x16, #0x28]\n"
- "add x28, x28, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
- "ldr x20, [x16, #0x30]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "ldr x19, [x16, #0x38]\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q9, [x21, x14]\n"
- "ldr x26, [x16, #0x40]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr x25, [x16, #0x48]\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
- "ldr q11, [x20, x14]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "ldr q10, [x25, x14]\n"
- "ldr x24, [x16, #0x50]\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
- "ldr x23, [x16, #0x58]\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v6.4s, v9.4s\n"
- "ldr q12, [x19, x14]\n"
- "fmla v28.4s, v3.4s, v13.4s\n"
- "ldr q9, [x26, x14]\n"
- "ldr x22, [x16, #0x60]\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
- "ldr x20, [x16, #0x70]\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "ldr x19, [x16, #0x78]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr x22, [x13, #0x28]\n"
+ "ldr x21, [x13, #0x30]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x22, x28]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr x22, [x13, #0x48]\n"
+ "ldr q10, [x22, x28]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v31.4s, v2.4s, v9.4s\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
- "ldr q9, [x22, x14]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "fmla v31.4s, v8.4s, v10.4s\n"
- "fmla v30.4s, v7.4s, v10.4s\n"
- "ldr q10, [x21, x14]\n"
- "fmla v29.4s, v3.4s, v9.4s\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q11, [x20, x14]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x19, x14]\n"
- "add x14, x14, #0x10\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
- "fmla v31.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q31, [x13, x28]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q30, [x12, x28]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
+ "ldr x26, [x13, #0x40]\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x25, [x13, #0x50]\n"
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x26, x28]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "ldr x24, [x13, #0x58]\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "ldr x23, [x13, #0x60]\n"
+ "ldr x22, [x13, #0x68]\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x24, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr q9, [x23, x28]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x22, x28]\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x21, x28]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x20, x28]\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x28, x28, #0x10\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
"fmax v28.4s, v28.4s, v18.4s\n"
- "str q29, [x10, x28]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
"fmin v28.4s, v28.4s, v17.4s\n"
- "str q28, [x9, x28]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q28, [x12, x27]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q29, [x11, x27]\n"
+ "str q30, [x10, x27]\n"
+ "str q31, [x9, x27]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 30f\n"
- "ldr q16, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x28, x14\n"
- "ldr q1, [x15, #0x20]\n"
- "add x13, x13, x28\n"
- "ldr q2, [x15, #0x30]\n"
- "add x12, x12, x28\n"
- "ldr q3, [x15, #0x40]\n"
- "add x10, x10, x28\n"
- "ldr q4, [x15, #0x50]\n"
- "add x9, x9, x28\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x26, [x16, #0x0]\n"
- "ldr x25, [x16, #0x8]\n"
- "add x26, x26, x14\n"
- "ldr x24, [x16, #0x10]\n"
- "ldr x23, [x16, #0x18]\n"
- "add x25, x25, x14\n"
- "ldr x22, [x16, #0x20]\n"
- "add x24, x24, x14\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "mov x27, x28\n"
+ "add x12, x12, x27\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x11, x11, x27\n"
+ "add x10, x10, x27\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x9, x9, x27\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "ldr x24, [x13, #0x0]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ "ldr x22, [x13, #0x10]\n"
+ "ldr x21, [x13, #0x18]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x20]\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.d }[0], [x26], #0x8\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 5f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 5f\n"
"4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
- "ld1 { v9.s }[0], [x26], #0x4\n"
- "ld1 { v10.s }[0], [x25], #0x4\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
- "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
- "ldr x21, [x16, #0x28]\n"
- "add x21, x21, x14\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v12.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr x20, [x13, #0x28]\n"
+ "add x20, x20, x28\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v9.d }[0], [x21], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v9.s }[2], [x21], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 7f\n"
"6:" // Oddments: Load input (3, 0): Bit 1: Unset
- "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (3, 0): Bit 1: End
- "fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x20, [x16, #0x30]\n"
- "fmla v31.4s, v7.4s, v13.4s\n"
- "add x20, x20, x14\n"
- "fmla v30.4s, v6.4s, v13.4s\n"
- "fmla v28.4s, v3.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x13, #0x30]\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -364,95 +364,95 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (3, 3): Bit 1: End
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr x19, [x16, #0x38]\n"
- "add x19, x19, x14\n"
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Load input (0, 1): Bit 1: Unset
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (0, 1): Bit 1: End
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x26, [x16, #0x40]\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "add x26, x26, x14\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v9.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 13f\n"
"12:" // Oddments: Load input (0, 2): Bit 1: Unset
- "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 2): Bit 1: End
- "fmla v31.4s, v2.4s, v9.4s\n"
- "ldr x25, [x16, #0x48]\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
- "add x25, x25, x14\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 15f\n"
"14:" // Oddments: Load input (2, 2): Bit 1: Unset
- "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 2): Bit 1: End
- "fmla v31.4s, v8.4s, v10.4s\n"
- "ldr x24, [x16, #0x50]\n"
- "fmla v30.4s, v7.4s, v10.4s\n"
- "add x24, x24, x14\n"
- "fmla v29.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x28\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: Load input (1, 0): Bit 1: Unset
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (1, 0): Bit 1: End
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr x23, [x16, #0x58]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "add x23, x23, x14\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 19f\n"
"18:" // Oddments: Load input (1, 3): Bit 1: Unset
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 3): Bit 1: End
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr x22, [x16, #0x60]\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
- "add x22, x22, x14\n"
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 21f\n"
"20:" // Oddments: Load input (2, 0): Bit 1: Unset
- "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v9.4s\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla v29.4s, v3.4s, v9.4s\n"
- "add x21, x21, x14\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 23f\n"
"22:" // Oddments: Load input (2, 3): Bit 1: Unset
- "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (2, 3): Bit 1: End
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr x20, [x16, #0x70]\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
- "add x20, x20, x14\n"
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -461,51 +461,49 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 1): Bit 1: End
- "fmla v29.4s, v7.4s, v11.4s\n"
- "ldr x19, [x16, #0x78]\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
- "add x19, x19, x14\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 27f\n"
"26:" // Oddments: Load input (3, 2): Bit 1: Unset
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (3, 2): Bit 1: End
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
"fmax v28.4s, v28.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
"fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
"tbz %x[n_channels], #1, 28f\n"
- "st1 { v31.d }[0], [x13], #0x8\n"
- "st1 { v30.d }[0], [x12], #0x8\n"
- "st1 { v29.d }[0], [x10], #0x8\n"
- "st1 { v28.d }[0], [x9], #0x8\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
- "st1 { v31.s }[2], [x13], #0x4\n"
- "st1 { v30.s }[2], [x12], #0x4\n"
- "st1 { v29.s }[2], [x10], #0x4\n"
- "st1 { v28.s }[2], [x9], #0x4\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
"b 29f\n"
"28:" // Oddments: Store: Bit 1: Unset
- "st1 { v31.s }[0], [x13], #0x4\n"
- "st1 { v30.s }[0], [x12], #0x4\n"
- "st1 { v29.s }[0], [x10], #0x4\n"
- "st1 { v28.s }[0], [x9], #0x4\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
"29:" // Oddments: Store: Bit 1: End
-
"30:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 6d185e7274..620319bc7c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,739 +87,739 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x7, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
"1:" // Tile loop
- "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
"mov x26, #0x3\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x25, #0x3\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x24, %x[params_struct], %[offsetof_args_min]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "add x21, %x[params_struct], %[offsetof_args_max]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mov x23, #0x0\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col
- "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
- "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x15, x15, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1r { v18.4s }, [x24]\n"
- "add x12, x15, x22, LSL #2\n"
- "ld1r { v17.4s }, [x21]\n"
- "add x11, x12, x22, LSL #2\n"
- "lsl x16, x16, #0x2\n"
- "add x10, x11, x22, LSL #2\n"
- "add x9, x10, x22, LSL #2\n"
- "add x28, x16, x16\n"
- "add x27, x28, x16\n"
- "add x26, x27, x16\n"
- "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
- "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x25\n" // offset *= output_tile_size
- "add x13, x13, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x25, x13, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "lsl x14, x14, #0x2\n"
- "add x22, x14, x14\n"
- "mov x21, #0x10\n" // cntb _, ALL, #1
- "sub x20, XZR, x21\n"
- "lsr x19, %x[n_channels], #0x2\n"
- "cbz x19, 4f\n"
- "ldr q16, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
- "cmp x21, x19, LSL #4\n"
- "ldr q1, [x17, #0x20]\n"
- "ldr q2, [x17, #0x30]\n"
- "ldr q3, [x17, #0x40]\n"
- "ldr q4, [x17, #0x50]\n"
- "ldr q5, [x17, #0x60]\n"
- "ldr q6, [x17, #0x70]\n"
- "ldr q7, [x17, #0x80]\n"
- "ldr q8, [x17, #0x90]\n"
- "add x17, x17, #0xa0\n"
- "ldr q9, [x11, x28]\n"
- "ld1 { v10.4s }, [x15]\n"
- "ldr q11, [x15, x26]\n"
- "ld1 { v12.4s }, [x9]\n"
- "ldr q13, [x12, x28]\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x2\n"
+ "lsr x23, %x[n_channels], #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x16, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #2\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x10, x12, x25, LSL #2\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #2\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x27, x10, x25, LSL #2\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #2\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
"add x20, x20, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "add x23, x23, #0x10\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"add x21, x21, #0x10\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
- "cmp x21, x19, LSL #4\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
"mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "ldr q16, [x17, #0x0]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x11, x27]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x11, x16]\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x12, x9]\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x12, x8]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v26.4s, v0.4s, v13.4s\n"
- "ldr q13, [x15, x16]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x15, x27]\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x16, x8]\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x27, x26]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "ldr q16, [x14, #0x0]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x16, x9]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v13.4s\n"
- "ldr q13, [x12, x26]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x13]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
"ld1 { v12.4s }, [x10]\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v2.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr q10, [x10, x28]\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "ldr q13, [x13, x26]\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
"ldr q11, [x10, x26]\n"
- "fmla v29.4s, v5.4s, v13.4s\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "ldr q13, [x9, x16]\n"
- "fmla v25.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr q12, [x12, x16]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "ldr q13, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x13, x8]\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v26.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v26.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v13.4s\n"
- "ldr q13, [x9, x27]\n"
- "fmla v23.4s, v5.4s, v11.4s\n"
- "ldr q11, [x12, x27]\n"
- "add x12, x12, #0x10\n"
- "fmla v31.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x27, x9]\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x10, x16]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v5.4s, v11.4s\n"
- "fmla v26.4s, v1.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q11, [x15, x28]\n"
- "add x15, x15, #0x10\n"
- "fmla v24.4s, v8.4s, v13.4s\n"
- "ld1 { v10.4s }, [x15]\n"
- "fmla v23.4s, v7.4s, v13.4s\n"
- "ldr q13, [x10, x27]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x13, x9]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x10, x9]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x10, x8]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
"add x10, x10, #0x10\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x11]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "ld1 { v12.4s }, [x11]\n"
+ "ld1 { v12.4s }, [x12]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x12, x26]\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "ldr q0, [x14, #0x10]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "ldr q1, [x17, #0x20]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x11, x26]\n"
- "add x11, x11, #0x10\n"
+ "ldr q2, [x14, #0x30]\n"
"fmla v27.4s, v8.4s, v13.4s\n"
- "ldr q9, [x11, x28]\n"
- "fmla v26.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v5.4s, v13.4s\n"
- "fmla v23.4s, v4.4s, v13.4s\n"
- "ldr q13, [x9, x28]\n"
- "add x9, x9, #0x10\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr q4, [x17, #0x50]\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "ldr q3, [x17, #0x40]\n"
- "fmla v25.4s, v0.4s, v12.4s\n"
- "ld1 { v12.4s }, [x9]\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q0, [x17, #0x10]\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "ldr q5, [x17, #0x60]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "ldr q11, [x15, x26]\n"
- "fmla v25.4s, v8.4s, v13.4s\n"
- "ldr q2, [x17, #0x30]\n"
- "fmla v24.4s, v7.4s, v13.4s\n"
- "ldr q7, [x17, #0x80]\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "ldr q8, [x17, #0x90]\n"
- "fmla v23.4s, v6.4s, v13.4s\n"
- "ldr q13, [x12, x28]\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "ldr q6, [x17, #0x70]\n"
- "add x17, x17, #0xa0\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "st1 { v31.4s }, [x13]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q30, [x13, x14]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "str q29, [x13, x22]\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "add x13, x13, #0x10\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "st1 { v28.4s }, [x25]\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
+ "ldr q13, [x27, x11]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
"fmax v25.4s, v25.4s, v18.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "str q27, [x25, x14]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "st1 { v23.4s }, [x15]\n"
"fmin v25.4s, v25.4s, v17.4s\n"
- "str q26, [x25, x22]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "str q24, [x15, x17]\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q25, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q27, [x28, x17]\n"
+ "str q28, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v29.4s }, [x25]\n"
+ "str q30, [x25, x17]\n"
+ "str q31, [x25, x22]\n"
"add x25, x25, #0x10\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "st1 { v25.4s }, [x24]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "str q24, [x24, x14]\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x24, x22]\n"
- "add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
"mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x11, x27]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x11, x16]\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x26]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x12, x9]\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x12, x8]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v26.4s, v0.4s, v13.4s\n"
- "ldr q13, [x15, x16]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x15, x27]\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "fmla v30.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x16, x8]\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x27, x26]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x16, x9]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "fmla v30.4s, v0.4s, v13.4s\n"
- "ldr q13, [x12, x26]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x13]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
"ld1 { v12.4s }, [x10]\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v2.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr q10, [x10, x28]\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "fmla v28.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "ldr q13, [x13, x26]\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
"ldr q11, [x10, x26]\n"
- "fmla v29.4s, v5.4s, v13.4s\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "ldr q13, [x9, x16]\n"
- "fmla v25.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr q12, [x12, x16]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "ldr q13, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x13, x8]\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v26.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v26.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v13.4s\n"
- "ldr q13, [x9, x27]\n"
- "fmla v23.4s, v5.4s, v11.4s\n"
- "ldr q11, [x12, x27]\n"
- "add x12, x12, #0x10\n"
- "fmla v31.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x27, x9]\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x10, x16]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v5.4s, v11.4s\n"
- "fmla v26.4s, v1.4s, v11.4s\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q11, [x15, x28]\n"
- "add x15, x15, #0x10\n"
- "fmla v24.4s, v8.4s, v13.4s\n"
- "fmla v23.4s, v7.4s, v13.4s\n"
- "ldr q13, [x10, x27]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x13, x9]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x10, x9]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x10, x8]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
"add x10, x10, #0x10\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x11]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "ld1 { v12.4s }, [x11]\n"
+ "ld1 { v12.4s }, [x12]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x12, x26]\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x12, x12, #0x10\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x11, x26]\n"
- "add x11, x11, #0x10\n"
"fmla v27.4s, v8.4s, v13.4s\n"
- "fmla v26.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v5.4s, v13.4s\n"
- "fmla v23.4s, v4.4s, v13.4s\n"
- "ldr q13, [x9, x28]\n"
- "add x9, x9, #0x10\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v25.4s, v0.4s, v12.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v25.4s, v8.4s, v13.4s\n"
- "fmla v24.4s, v7.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmla v23.4s, v6.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "st1 { v31.4s }, [x13]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "str q30, [x13, x14]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "str q29, [x13, x22]\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "add x13, x13, #0x10\n"
+ "ldr q13, [x27, x11]\n"
"fmax v27.4s, v27.4s, v18.4s\n"
- "st1 { v28.4s }, [x25]\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
"fmax v25.4s, v25.4s, v18.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "str q27, [x25, x14]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "st1 { v23.4s }, [x15]\n"
"fmin v25.4s, v25.4s, v17.4s\n"
- "str q26, [x25, x22]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "str q24, [x15, x17]\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q25, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q27, [x28, x17]\n"
+ "str q28, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v29.4s }, [x25]\n"
+ "str q30, [x25, x17]\n"
+ "str q31, [x25, x22]\n"
"add x25, x25, #0x10\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "st1 { v25.4s }, [x24]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "str q24, [x24, x14]\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x24, x22]\n"
- "add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 49f\n"
- "ldr q16, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
- "add x23, x11, x28\n"
- "ldr q1, [x17, #0x20]\n"
- "add x22, x15, XZR\n"
- "ldr q2, [x17, #0x30]\n"
- "add x21, x15, x26\n"
- "ldr q3, [x17, #0x40]\n"
- "add x20, x9, XZR\n"
- "ldr q4, [x17, #0x50]\n"
- "add x19, x12, x28\n"
- "ldr q5, [x17, #0x60]\n"
- "ldr q6, [x17, #0x70]\n"
- "ldr q7, [x17, #0x80]\n"
- "ldr q8, [x17, #0x90]\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
"tbz %x[n_channels], #1, 5f\n"
- "ldr d9, [x23], #0x8\n"
- "ldr d10, [x22], #0x8\n"
- "ldr d11, [x21], #0x8\n"
- "ldr d12, [x20], #0x8\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v9.s }[2], [x23]\n"
- "ld1 { v10.s }[2], [x22]\n"
- "ld1 { v11.s }[2], [x21]\n"
- "ld1 { v12.s }[2], [x20]\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 6f\n"
"5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
- "ldr s9, [x23, #0x0]\n"
- "ldr s10, [x22, #0x0]\n"
- "ldr s11, [x21, #0x0]\n"
- "ldr s12, [x20, #0x0]\n"
- "ldr s13, [x19, #0x0]\n"
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
- "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "add x19, x9, x26\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
"mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v26.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
"tbz %x[n_channels], #1, 7f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
- "fmla v23.4s, v8.4s, v12.4s\n"
- "add x19, x11, x16\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x12, x8\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 10f\n"
"9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
- "fmla v31.4s, v7.4s, v11.4s\n"
- "add x19, x15, x16\n"
- "fmla v30.4s, v6.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x16, x8\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 11f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
- "fmla v31.4s, v1.4s, v13.4s\n"
- "add x19, x15, x27\n"
- "fmla v30.4s, v0.4s, v13.4s\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x16, x9\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 14f\n"
"13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
- "fmla v30.4s, v2.4s, v12.4s\n"
- "add x19, x11, x27\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
"tbz %x[n_channels], #1, 15f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
- "fmla v30.4s, v8.4s, v10.4s\n"
- "add x19, x12, XZR\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x13, XZR\n"
"fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v24.4s, v2.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
"tbz %x[n_channels], #1, 17f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 18f\n"
"17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
- "fmla v31.4s, v3.4s, v11.4s\n"
- "add x19, x12, x26\n"
- "fmla v28.4s, v0.4s, v11.4s\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x13, x26\n"
"tbz %x[n_channels], #1, 19f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
- "fmla v29.4s, v5.4s, v13.4s\n"
- "add x19, x10, XZR\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x10, XZR\n"
"tbz %x[n_channels], #1, 21f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 22f\n"
"21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
- "fmla v28.4s, v6.4s, v12.4s\n"
- "add x19, x10, x28\n"
- "fmla v25.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x10, x11\n"
"tbz %x[n_channels], #1, 23f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
- "fmla v28.4s, v8.4s, v10.4s\n"
- "add x19, x10, x26\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v26.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
+ "add x20, x10, x26\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
"tbz %x[n_channels], #1, 25f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 26f\n"
"25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
- "fmla v26.4s, v8.4s, v11.4s\n"
- "add x19, x9, x16\n"
- "fmla v23.4s, v5.4s, v11.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
- "fmla v25.4s, v7.4s, v13.4s\n"
- "add x19, x12, x16\n"
- "fmla v24.4s, v6.4s, v13.4s\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x13, x8\n"
"tbz %x[n_channels], #1, 29f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 30f\n"
"29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
- "fmla v31.4s, v4.4s, v12.4s\n"
- "add x19, x12, x27\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x13, x9\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 31f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
- "fmla v30.4s, v5.4s, v11.4s\n"
- "add x19, x9, x27\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x27, x9\n"
"fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 33f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 34f\n"
"33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
- "fmla v24.4s, v8.4s, v13.4s\n"
- "add x19, x10, x16\n"
- "fmla v23.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x10, x8\n"
"tbz %x[n_channels], #1, 35f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
- "fmla v28.4s, v7.4s, v12.4s\n"
- "add x19, x15, x28\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 37f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 38f\n"
"37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
- "fmla v31.4s, v2.4s, v11.4s\n"
- "add x19, x10, x27\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x10, x9\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 39f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
"fmla v27.4s, v8.4s, v13.4s\n"
- "add x19, x11, XZR\n"
- "fmla v26.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v5.4s, v13.4s\n"
- "fmla v23.4s, v4.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 41f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 42f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 42f\n"
"41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v12.4s\n"
- "add x19, x11, x26\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v25.4s, v0.4s, v12.4s\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x12, x26\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 43f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
- "fmla v29.4s, v8.4s, v11.4s\n"
- "add x19, x9, x28\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
"tbz %x[n_channels], #1, 45f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 46f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 46f\n"
"45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
- "fmla v25.4s, v8.4s, v13.4s\n"
- "fmla v24.4s, v7.4s, v13.4s\n"
- "fmla v23.4s, v6.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
"fmax v23.4s, v23.4s, v18.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
"fmin v23.4s, v23.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
"tbz %x[n_channels], #1, 47f\n"
- "mov x19, x13\n"
- "st1 { v31.d }[0], [x19], x14\n"
- "add x13, x13, #0x8\n"
- "st1 { v30.d }[0], [x19], x14\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
"mov x20, x25\n"
- "st1 { v29.d }[0], [x19]\n"
- "st1 { v28.d }[0], [x20], x14\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
"add x25, x25, #0x8\n"
- "st1 { v27.d }[0], [x20], x14\n"
- "mov x19, x24\n"
- "st1 { v26.d }[0], [x20]\n"
- "add x24, x24, #0x8\n"
- "st1 { v25.d }[0], [x19], x14\n"
- "st1 { v24.d }[0], [x19], x14\n"
- "st1 { v23.d }[0], [x19]\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 48f\n"
- "mov x21, x13\n"
- "st1 { v31.s }[2], [x21], x14\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
"mov x20, x25\n"
- "st1 { v30.s }[2], [x21], x14\n"
- "st1 { v28.s }[2], [x20], x14\n"
- "mov x19, x24\n"
- "st1 { v29.s }[2], [x21]\n"
- "st1 { v27.s }[2], [x20], x14\n"
- "st1 { v26.s }[2], [x20]\n"
- "st1 { v25.s }[2], [x19], x14\n"
- "st1 { v24.s }[2], [x19], x14\n"
- "st1 { v23.s }[2], [x19]\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Store: Bit 1: Unset
- "mov x21, x13\n"
- "st1 { v31.s }[0], [x21], x14\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
"mov x20, x25\n"
- "mov x19, x24\n"
- "st1 { v30.s }[0], [x21], x14\n"
- "st1 { v28.s }[0], [x20], x14\n"
- "st1 { v29.s }[0], [x21]\n"
- "st1 { v27.s }[0], [x20], x14\n"
- "st1 { v26.s }[0], [x20]\n"
- "st1 { v25.s }[0], [x19], x14\n"
- "st1 { v24.s }[0], [x19], x14\n"
- "st1 { v23.s }[0], [x19]\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"48:" // Tile loop: Oddments: Store: Bit 1: End
"49:" // Tile loop: End
- "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x7, #0x1\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "add x8, x8, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x8, x19\n"
- "csel x8, x8, XZR, LT\n"
- "csel x7, x7, x21, LT\n"
- "cmp x7, x20\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index aa7d35e3e1..15053a337a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,21 +87,21 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.4s }, [x20]\n"
- "ld1r { v17.4s }, [x19]\n"
- "mov x14, #0x0\n"
- "mov x13, #0x10\n" // cntb _, ALL, #1
- "sub x12, XZR, x13\n"
- "lsr x11, %x[n_channels], #0x2\n"
- "cbz x11, 3f\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x13, #0x0\n"
+ "sub x12, XZR, x8\n"
+ "cbz x17, 3f\n"
"ldr q16, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
- "cmp x13, x11, LSL #4\n"
+ "cmp x8, x17, LSL #4\n"
"ldr q1, [x15, #0x20]\n"
"ldr q2, [x15, #0x30]\n"
"ldr q3, [x15, #0x40]\n"
@@ -111,363 +111,363 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
"add x15, x15, #0xa0\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "ldp x28, x27, [x16, #0x10]\n"
- "ldr x26, [x16, #0x20]\n"
- "ldr q9, [x10, x14]\n"
- "ldr q10, [x9, x14]\n"
- "ldr q11, [x28, x14]\n"
- "ldr q12, [x27, x14]\n"
- "ldr q13, [x26, x14]\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "ldr q9, [x11, x13]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr q12, [x28, x13]\n"
+ "ldr x27, [x14, #0x20]\n"
+ "ldr q13, [x27, x13]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x25, [x16, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "ldr x24, [x16, #0x30]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x23, [x16, #0x38]\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
- "ldr x10, [x16, #0x40]\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x14, #0x30]\n"
+ "ldr x25, [x14, #0x38]\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr x24, [x14, #0x28]\n"
+ "ldr x10, [x14, #0x48]\n"
+ "ldr q10, [x10, x13]\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x11, [x14, #0x40]\n"
"mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "ldr x9, [x16, #0x48]\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
- "ldr x28, [x16, #0x50]\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
- "ldr x27, [x16, #0x58]\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
- "ldr x26, [x16, #0x60]\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "ldr x22, [x17, #0x0]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x14]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "ldr q12, [x25, x14]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "ldr x24, [x16, #0x70]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr x21, [x17, #0x8]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr x27, [x14, #0x60]\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr x26, [x14, #0x70]\n"
+ "ldr x10, [x14, #0x88]\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
- "ldr x20, [x17, #0x10]\n"
- "fmla v26.4s, v0.4s, v13.4s\n"
- "ldr q13, [x23, x14]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x10, x14]\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr x23, [x16, #0x78]\n"
- "fmla v30.4s, v6.4s, v11.4s\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "ldr x19, [x17, #0x18]\n"
- "fmla v27.4s, v3.4s, v11.4s\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x25, x13]\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
"ldr q16, [x15, #0x0]\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
- "ldr q11, [x28, x14]\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr x28, [x16, #0x90]\n"
- "fmla v30.4s, v0.4s, v13.4s\n"
- "ldr q13, [x27, x14]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr x27, [x16, #0x98]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "ldr x24, [x14, #0x68]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x11, x13]\n"
+ "ldr x25, [x14, #0x78]\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "ldr x11, [x14, #0x80]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x9, x13]\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "ldr q13, [x28, x13]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x27, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
"fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x14]\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla v24.4s, v2.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr q10, [x25, x14]\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla v28.4s, v0.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v29.4s, v5.4s, v13.4s\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "ldr q13, [x23, x14]\n"
- "fmla v25.4s, v3.4s, v12.4s\n"
- "ldr x23, [x16, #0xb8]\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr q12, [x10, x14]\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "ldr x27, [x14, #0xa0]\n"
+ "ldr x28, [x14, #0x98]\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x24, x13]\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr x24, [x14, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x11, x13]\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla v26.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v26.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v13.4s\n"
- "ldr q13, [x28, x14]\n"
- "fmla v23.4s, v5.4s, v11.4s\n"
- "ldr q11, [x9, x14]\n"
- "fmla v31.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr x11, [x14, #0xc0]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "ldr q13, [x25, x13]\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "ldr x25, [x14, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x27, x14]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v5.4s, v11.4s\n"
- "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x10, x13]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x9, x13]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x28, x13]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x14]\n"
- "fmla v24.4s, v8.4s, v13.4s\n"
- "ldr x26, [x16, #0x20]\n"
- "fmla v23.4s, v7.4s, v13.4s\n"
- "ldr q13, [x25, x14]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "ldr x27, [x14, #0x20]\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x24, x13]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "ldr q12, [x24, x14]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x26, x13]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
"ldr q1, [x15, #0x20]\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x23, x14]\n"
- "fmla v27.4s, v8.4s, v13.4s\n"
- "fmla v26.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v5.4s, v13.4s\n"
- "fmla v23.4s, v4.4s, v13.4s\n"
- "ldr q13, [x10, x14]\n"
- "add x14, x14, #0x10\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "ldp x28, x27, [x16, #0x10]\n"
- "fmla v25.4s, v0.4s, v12.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x25, x13]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "str q23, [x23, x12]\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
"ldr q0, [x15, #0x10]\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr q9, [x10, x13]\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "ldr q10, [x9, x13]\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "ldr q11, [x28, x13]\n"
- "fmla v25.4s, v8.4s, v13.4s\n"
- "ldr q12, [x27, x13]\n"
- "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
"ldr q2, [x15, #0x30]\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "ldr q13, [x11, x13]\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
"ldr q3, [x15, #0x40]\n"
- "fmla v23.4s, v6.4s, v13.4s\n"
- "ldr q13, [x26, x13]\n"
- "add x13, x13, #0x10\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "ldr q4, [x15, #0x50]\n"
- "cmp x13, x11, LSL #4\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
"ldr q5, [x15, #0x60]\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x27, x8]\n"
"ldr q6, [x15, #0x70]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "str q31, [x22, x12]\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
"fmax v27.4s, v27.4s, v18.4s\n"
- "ldr x22, [x17, #0x20]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "ldr q7, [x15, #0x80]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "ldr q9, [x11, x8]\n"
+ "ldr q10, [x10, x8]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "ldp x9, x28, [x14, #0x10]\n"
+ "ldr q11, [x9, x8]\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "ldr q12, [x28, x8]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "str q24, [x22, x12]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "str q25, [x21, x12]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "str q26, [x20, x12]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "add x8, x8, #0x10\n"
+ "str q27, [x23, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "cmp x8, x17, LSL #4\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
"fmin v29.4s, v29.4s, v17.4s\n"
- "ldr q8, [x15, #0x90]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "str q28, [x22, x12]\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q29, [x21, x12]\n"
"add x15, x15, #0xa0\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "str q30, [x21, x12]\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "str q29, [x20, x12]\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "ldr x21, [x17, #0x28]\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "str q28, [x19, x12]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "str q27, [x22, x12]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "ldr x20, [x17, #0x30]\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "ldr x19, [x17, #0x38]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "str q26, [x21, x12]\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "str q25, [x20, x12]\n"
- "ldr x22, [x17, #0x40]\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q24, [x19, x12]\n"
- "str q23, [x22, x12]\n"
+ "str q30, [x20, x12]\n"
+ "str q31, [x23, x12]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x25, [x16, #0x28]\n"
- "add x12, x12, #0x10\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "ldr x24, [x16, #0x30]\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x23, [x16, #0x38]\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
- "ldr x10, [x16, #0x40]\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x14, #0x30]\n"
+ "ldr x25, [x14, #0x38]\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr x24, [x14, #0x28]\n"
+ "ldr x10, [x14, #0x48]\n"
+ "ldr q10, [x10, x13]\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x11, [x14, #0x40]\n"
"mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "ldr x9, [x16, #0x48]\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
- "ldr x28, [x16, #0x50]\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
- "ldr x27, [x16, #0x58]\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
- "ldr x26, [x16, #0x60]\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "ldr x22, [x17, #0x0]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x14]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "ldr q12, [x25, x14]\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "ldr x24, [x16, #0x70]\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr x21, [x17, #0x8]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr x27, [x14, #0x60]\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr x26, [x14, #0x70]\n"
+ "ldr x10, [x14, #0x88]\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
- "ldr x20, [x17, #0x10]\n"
- "fmla v26.4s, v0.4s, v13.4s\n"
- "ldr q13, [x23, x14]\n"
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr q12, [x10, x14]\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr x23, [x16, #0x78]\n"
- "fmla v30.4s, v6.4s, v11.4s\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "ldr x19, [x17, #0x18]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x25, x13]\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "ldr x24, [x14, #0x68]\n"
+ "ldr x25, [x14, #0x78]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x11, x13]\n"
+ "ldr x11, [x14, #0x80]\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
- "ldr q11, [x28, x14]\n"
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr x28, [x16, #0x90]\n"
- "fmla v30.4s, v0.4s, v13.4s\n"
- "ldr q13, [x27, x14]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ldr x27, [x16, #0x98]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x9, x13]\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "ldr q13, [x28, x13]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x27, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
"fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x14]\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla v24.4s, v2.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr q10, [x25, x14]\n"
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla v28.4s, v0.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v29.4s, v5.4s, v13.4s\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "ldr q13, [x23, x14]\n"
- "fmla v25.4s, v3.4s, v12.4s\n"
- "ldr x23, [x16, #0xb8]\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr q12, [x10, x14]\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "ldr x27, [x14, #0xa0]\n"
+ "ldr x28, [x14, #0x98]\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x24, x13]\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr x24, [x14, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x11, x13]\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla v26.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v28.4s, v8.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v26.4s, v8.4s, v11.4s\n"
- "fmla v25.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v6.4s, v13.4s\n"
- "ldr q13, [x28, x14]\n"
- "fmla v23.4s, v5.4s, v11.4s\n"
- "ldr q11, [x9, x14]\n"
- "fmla v31.4s, v4.4s, v12.4s\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr x11, [x14, #0xc0]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "ldr q13, [x25, x13]\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "ldr x25, [x14, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
- "ldr q12, [x27, x14]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v5.4s, v11.4s\n"
- "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x10, x13]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x9, x13]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x28, x13]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q11, [x26, x14]\n"
- "fmla v24.4s, v8.4s, v13.4s\n"
- "fmla v23.4s, v7.4s, v13.4s\n"
- "ldr q13, [x25, x14]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x24, x13]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "ldr q12, [x24, x14]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x26, x13]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x25, x13]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "str q23, [x23, x12]\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q11, [x23, x14]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
"fmla v27.4s, v8.4s, v13.4s\n"
- "fmla v26.4s, v7.4s, v13.4s\n"
- "fmla v24.4s, v5.4s, v13.4s\n"
- "fmla v23.4s, v4.4s, v13.4s\n"
- "ldr q13, [x10, x14]\n"
- "add x14, x14, #0x10\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v25.4s, v0.4s, v12.4s\n"
- "fmla v29.4s, v8.4s, v11.4s\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v25.4s, v8.4s, v13.4s\n"
- "fmla v24.4s, v7.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmla v23.4s, v6.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "str q31, [x22, x12]\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "ldr x22, [x17, #0x20]\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "str q30, [x21, x12]\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "ldr x21, [x17, #0x28]\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "str q29, [x20, x12]\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
+ "ldr q13, [x11, x13]\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
"fmax v26.4s, v26.4s, v18.4s\n"
- "str q28, [x19, x12]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
"fmax v25.4s, v25.4s, v18.4s\n"
- "ldr x20, [x17, #0x30]\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
- "str q27, [x22, x12]\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "ldr x19, [x17, #0x38]\n"
+ "str q24, [x22, x12]\n"
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "ldr x22, [x16, #0x28]\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
"fmin v25.4s, v25.4s, v17.4s\n"
- "ldr x22, [x17, #0x40]\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
- "str q26, [x21, x12]\n"
- "fmax v23.4s, v23.4s, v18.4s\n"
- "str q25, [x20, x12]\n"
- "str q24, [x19, x12]\n"
- "fmin v23.4s, v23.4s, v17.4s\n"
- "str q23, [x22, x12]\n"
+ "str q25, [x21, x12]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "str q26, [x20, x12]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "str q27, [x23, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q28, [x22, x12]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q29, [x21, x12]\n"
+ "str q30, [x20, x12]\n"
+ "str q31, [x23, x12]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 48f\n"
"ldr q16, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
- "mov x12, x14\n"
+ "mov x12, x13\n"
"ldr q1, [x15, #0x20]\n"
"ldr q2, [x15, #0x30]\n"
"ldr q3, [x15, #0x40]\n"
@@ -476,428 +476,426 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr q6, [x15, #0x70]\n"
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
- "ldr x10, [x16, #0x0]\n"
- "add x10, x10, x14\n"
- "ldr x9, [x16, #0x8]\n"
- "ldr x28, [x16, #0x10]\n"
- "add x9, x9, x14\n"
- "ldr x27, [x16, #0x18]\n"
- "ldr x26, [x16, #0x20]\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "add x26, x26, x14\n"
+ "ldr x24, [x14, #0x0]\n"
+ "ldr x23, [x14, #0x8]\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "ldr x22, [x14, #0x10]\n"
+ "ldr x21, [x14, #0x18]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.d }[0], [x10], #0x8\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
- "ld1 { v13.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 5f\n"
- "ld1 { v9.s }[2], [x10], #0x4\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
- "ld1 { v13.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 5f\n"
"4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
- "ld1 { v9.s }[0], [x10], #0x4\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
- "ld1 { v11.s }[0], [x28], #0x4\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
- "ld1 { v13.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
- "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x25, [x16, #0x28]\n"
- "add x25, x25, x14\n"
- "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x14, #0x28]\n"
+ "add x20, x20, x13\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v5.4s, v9.4s\n"
"mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
- "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
- "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
- "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v25.4s, v6.4s, v12.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
- "fmla v26.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v12.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 7f\n"
"6:" // Oddments: Load input (4, 4): Bit 1: Unset
- "ld1 { v12.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (4, 4): Bit 1: End
- "fmla v23.4s, v8.4s, v12.4s\n"
- "ldr x24, [x16, #0x30]\n"
- "add x24, x24, x14\n"
+ "ldr x20, [x14, #0x30]\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 9f\n"
"8:" // Oddments: Load input (2, 1): Bit 1: Unset
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (2, 1): Bit 1: End
- "fmla v31.4s, v7.4s, v11.4s\n"
- "ldr x23, [x16, #0x38]\n"
- "fmla v30.4s, v6.4s, v11.4s\n"
- "add x23, x23, x14\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Load input (0, 1): Bit 1: Unset
- "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (0, 1): Bit 1: End
- "fmla v31.4s, v1.4s, v13.4s\n"
- "ldr x10, [x16, #0x40]\n"
- "fmla v30.4s, v0.4s, v13.4s\n"
- "add x10, x10, x14\n"
+ "ldr x20, [x14, #0x40]\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v12.s }[2], [x10], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 13f\n"
"12:" // Oddments: Load input (0, 3): Bit 1: Unset
- "ld1 { v12.s }[0], [x10], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 3): Bit 1: End
- "fmla v30.4s, v2.4s, v12.4s\n"
- "ldr x9, [x16, #0x48]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "add x9, x9, x14\n"
+ "ldr x20, [x14, #0x48]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 15f\n"
"14:" // Oddments: Load input (2, 3): Bit 1: Unset
- "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 3): Bit 1: End
- "fmla v30.4s, v8.4s, v10.4s\n"
- "ldr x28, [x16, #0x50]\n"
- "fmla v29.4s, v7.4s, v10.4s\n"
- "add x28, x28, x14\n"
+ "ldr x20, [x14, #0x50]\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x13\n"
"fmla v27.4s, v5.4s, v10.4s\n"
- "fmla v26.4s, v4.4s, v10.4s\n"
- "fmla v24.4s, v2.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: Load input (1, 0): Bit 1: Unset
- "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (1, 0): Bit 1: End
- "fmla v31.4s, v3.4s, v11.4s\n"
- "ldr x27, [x16, #0x58]\n"
- "fmla v28.4s, v0.4s, v11.4s\n"
- "add x27, x27, x14\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v13.d }[0], [x27], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.s }[2], [x27], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 19f\n"
"18:" // Oddments: Load input (1, 4): Bit 1: Unset
- "ld1 { v13.s }[0], [x27], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 4): Bit 1: End
- "fmla v29.4s, v5.4s, v13.4s\n"
- "ldr x26, [x16, #0x60]\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "add x26, x26, x14\n"
+ "ldr x20, [x14, #0x60]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v12.d }[0], [x26], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 21f\n"
"20:" // Oddments: Load input (3, 0): Bit 1: Unset
- "ld1 { v12.s }[0], [x26], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (3, 0): Bit 1: End
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla v25.4s, v3.4s, v12.4s\n"
- "add x25, x25, x14\n"
+ "ldr x20, [x14, #0x68]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 23f\n"
"22:" // Oddments: Load input (3, 2): Bit 1: Unset
- "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (3, 2): Bit 1: End
- "fmla v28.4s, v8.4s, v10.4s\n"
- "ldr x24, [x16, #0x70]\n"
+ "ldr x20, [x14, #0x70]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "add x24, x24, x14\n"
- "fmla v26.4s, v6.4s, v10.4s\n"
- "fmla v25.4s, v5.4s, v10.4s\n"
- "fmla v24.4s, v4.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 25f\n"
"24:" // Oddments: Load input (3, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 4): Bit 1: End
- "fmla v26.4s, v8.4s, v11.4s\n"
- "ldr x23, [x16, #0x78]\n"
- "fmla v23.4s, v5.4s, v11.4s\n"
- "add x23, x23, x14\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 27f\n"
"26:" // Oddments: Load input (4, 1): Bit 1: Unset
- "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 1): Bit 1: End
- "fmla v25.4s, v7.4s, v13.4s\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla v24.4s, v6.4s, v13.4s\n"
- "add x10, x10, x14\n"
+ "ldr x20, [x14, #0x80]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
- "ld1 { v12.s }[2], [x10], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 29f\n"
"28:" // Oddments: Load input (1, 1): Bit 1: Unset
- "ld1 { v12.s }[0], [x10], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (1, 1): Bit 1: End
- "fmla v31.4s, v4.4s, v12.4s\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla v30.4s, v3.4s, v12.4s\n"
- "add x9, x9, x14\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x14, #0x88]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v11.d }[0], [x9], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 31f\n"
"30:" // Oddments: Load input (1, 3): Bit 1: Unset
- "ld1 { v11.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (1, 3): Bit 1: End
- "fmla v30.4s, v5.4s, v11.4s\n"
- "ldr x28, [x16, #0x90]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "add x28, x28, x14\n"
+ "ldr x20, [x14, #0x90]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"fmla v27.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v13.d }[0], [x28], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 33f\n"
"32:" // Oddments: Load input (4, 3): Bit 1: Unset
- "ld1 { v13.s }[0], [x28], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (4, 3): Bit 1: End
- "fmla v24.4s, v8.4s, v13.4s\n"
- "ldr x27, [x16, #0x98]\n"
- "fmla v23.4s, v7.4s, v13.4s\n"
- "add x27, x27, x14\n"
+ "ldr x20, [x14, #0x98]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 35f\n"
"34:" // Oddments: Load input (3, 1): Bit 1: Unset
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (3, 1): Bit 1: End
- "fmla v28.4s, v7.4s, v12.4s\n"
- "ldr x26, [x16, #0xa0]\n"
+ "ldr x20, [x14, #0xa0]\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "add x26, x26, x14\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 37f\n"
"36:" // Oddments: Load input (0, 2): Bit 1: Unset
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (0, 2): Bit 1: End
- "fmla v31.4s, v2.4s, v11.4s\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla v30.4s, v1.4s, v11.4s\n"
- "add x25, x25, x14\n"
- "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr x20, [x14, #0xa8]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v13.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 39f\n"
"38:" // Oddments: Load input (3, 3): Bit 1: Unset
- "ld1 { v13.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x14, #0xb0]\n"
"fmla v27.4s, v8.4s, v13.4s\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla v26.4s, v7.4s, v13.4s\n"
- "add x24, x24, x14\n"
- "fmla v24.4s, v5.4s, v13.4s\n"
- "fmla v23.4s, v4.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 41f\n"
"40:" // Oddments: Load input (2, 0): Bit 1: Unset
- "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v12.4s\n"
- "ldr x23, [x16, #0xb8]\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "add x23, x23, x14\n"
- "fmla v25.4s, v0.4s, v12.4s\n"
+ "ldr x20, [x14, #0xb8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 43f\n"
"42:" // Oddments: Load input (2, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (2, 4): Bit 1: End
- "fmla v29.4s, v8.4s, v11.4s\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla v26.4s, v5.4s, v11.4s\n"
- "add x10, x10, x14\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr x20, [x14, #0xc0]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v13.d }[0], [x10], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 45f\n"
- "ld1 { v13.s }[2], [x10], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 45f\n"
"44:" // Oddments: Load input (4, 2): Bit 1: Unset
- "ld1 { v13.s }[0], [x10], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"45:" // Oddments: Load input (4, 2): Bit 1: End
- "fmla v25.4s, v8.4s, v13.4s\n"
- "fmla v24.4s, v7.4s, v13.4s\n"
- "fmla v23.4s, v6.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v18.4s\n"
- "fmax v30.4s, v30.4s, v18.4s\n"
- "fmax v29.4s, v29.4s, v18.4s\n"
- "fmin v31.4s, v31.4s, v17.4s\n"
- "fmin v30.4s, v30.4s, v17.4s\n"
- "fmin v29.4s, v29.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "fmax v27.4s, v27.4s, v18.4s\n"
- "fmax v26.4s, v26.4s, v18.4s\n"
- "fmin v28.4s, v28.4s, v17.4s\n"
- "fmin v27.4s, v27.4s, v17.4s\n"
- "fmin v26.4s, v26.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v18.4s\n"
- "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
"fmax v23.4s, v23.4s, v18.4s\n"
- "fmin v25.4s, v25.4s, v17.4s\n"
- "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
"fmin v23.4s, v23.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
"tbz %x[n_channels], #1, 46f\n"
- "ldr x22, [x17, #0x0]\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.d }[0], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"add x22, x22, x12\n"
- "ldr x20, [x17, #0x10]\n"
- "ldr x19, [x17, #0x18]\n"
"add x21, x21, x12\n"
- "st1 { v31.d }[0], [x22]\n"
+ "ldr x23, [x16, #0x20]\n"
"add x20, x20, x12\n"
- "st1 { v30.d }[0], [x21]\n"
- "ldr x22, [x17, #0x20]\n"
- "add x19, x19, x12\n"
- "st1 { v29.d }[0], [x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.d }[0], [x22]\n"
+ "st1 { v25.d }[0], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
"add x22, x22, x12\n"
- "st1 { v28.d }[0], [x19]\n"
- "ldr x21, [x17, #0x28]\n"
+ "st1 { v26.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
"add x21, x21, x12\n"
- "st1 { v27.d }[0], [x22]\n"
- "ldr x20, [x17, #0x30]\n"
"add x20, x20, x12\n"
- "st1 { v26.d }[0], [x21]\n"
- "ldr x19, [x17, #0x38]\n"
- "add x19, x19, x12\n"
- "st1 { v25.d }[0], [x20]\n"
- "ldr x22, [x17, #0x40]\n"
- "add x22, x22, x12\n"
- "st1 { v24.d }[0], [x19]\n"
+ "st1 { v27.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
"add x12, x12, #0x8\n"
- "st1 { v23.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x22]\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v30.d }[0], [x20]\n"
+ "st1 { v31.d }[0], [x23]\n"
"tbz %x[n_channels], #0, 47f\n"
- "ldr x22, [x17, #0x0]\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"add x22, x22, x12\n"
- "ldr x20, [x17, #0x10]\n"
- "ldr x19, [x17, #0x18]\n"
"add x21, x21, x12\n"
- "st1 { v31.s }[2], [x22]\n"
+ "ldr x23, [x16, #0x20]\n"
"add x20, x20, x12\n"
- "st1 { v30.s }[2], [x21]\n"
- "ldr x22, [x17, #0x20]\n"
- "add x19, x19, x12\n"
- "st1 { v29.s }[2], [x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
"add x22, x22, x12\n"
- "st1 { v28.s }[2], [x19]\n"
- "ldr x21, [x17, #0x28]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
"add x21, x21, x12\n"
- "st1 { v27.s }[2], [x22]\n"
- "ldr x20, [x17, #0x30]\n"
"add x20, x20, x12\n"
- "st1 { v26.s }[2], [x21]\n"
- "ldr x19, [x17, #0x38]\n"
- "add x19, x19, x12\n"
- "st1 { v25.s }[2], [x20]\n"
- "ldr x22, [x17, #0x40]\n"
- "add x22, x22, x12\n"
- "st1 { v24.s }[2], [x19]\n"
- "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v31.s }[2], [x23]\n"
"b 47f\n"
"46:" // Oddments: Store: Bit 1: Unset
- "ldr x22, [x17, #0x0]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "add x23, x23, x12\n"
+ "st1 { v23.s }[0], [x23]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"add x22, x22, x12\n"
- "ldr x21, [x17, #0x8]\n"
- "ldr x20, [x17, #0x10]\n"
"add x21, x21, x12\n"
- "st1 { v31.s }[0], [x22]\n"
- "ldr x19, [x17, #0x18]\n"
+ "ldr x23, [x16, #0x20]\n"
"add x20, x20, x12\n"
- "st1 { v30.s }[0], [x21]\n"
- "add x19, x19, x12\n"
- "st1 { v29.s }[0], [x20]\n"
- "ldr x22, [x17, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v24.s }[0], [x22]\n"
+ "st1 { v25.s }[0], [x21]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
"add x22, x22, x12\n"
- "st1 { v28.s }[0], [x19]\n"
- "ldr x21, [x17, #0x28]\n"
+ "st1 { v26.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
"add x21, x21, x12\n"
- "st1 { v27.s }[0], [x22]\n"
- "ldr x20, [x17, #0x30]\n"
"add x20, x20, x12\n"
- "st1 { v26.s }[0], [x21]\n"
- "ldr x19, [x17, #0x38]\n"
- "add x19, x19, x12\n"
- "st1 { v25.s }[0], [x20]\n"
- "ldr x22, [x17, #0x40]\n"
- "add x22, x22, x12\n"
- "st1 { v24.s }[0], [x19]\n"
- "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v28.s }[0], [x22]\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v30.s }[0], [x20]\n"
+ "st1 { v31.s }[0], [x23]\n"
"47:" // Oddments: Store: Bit 1: End
-
"48:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 6faacf144a..6d2b6ee998 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,1143 +87,1142 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x4, #0x0\n"
+ "mov x27, #0x0\n"
"mov x26, #0x0\n"
"1:" // Tile loop
- "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x25, #0x4\n"
+ "mov x23, #0x4\n"
"str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x24, #0x4\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x23, %x[params_struct], %[offsetof_args_min]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "add x21, %x[params_struct], %[offsetof_args_max]\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mov x7, #0x0\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x4, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
- "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1r { v15.4s }, [x23]\n"
- "add x15, x8, x22, LSL #2\n"
- "ld1r { v14.4s }, [x21]\n"
- "add x14, x15, x22, LSL #2\n"
- "lsl x6, x6, #0x2\n"
- "add x13, x14, x22, LSL #2\n"
- "add x12, x13, x22, LSL #2\n"
- "add x11, x12, x22, LSL #2\n"
- "add x10, x6, x6\n"
- "add x9, x10, x6\n"
- "add x28, x9, x6\n"
- "add x27, x28, x6\n"
- "mul x19, x4, x20\n" // offset = tile_i * ld_output_row
- "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x24\n" // offset *= output_tile_size
- "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x26, x16, x20, LSL #2\n"
- "add x25, x26, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "lsl x17, x17, #0x2\n"
- "add x23, x17, x17\n"
- "add x22, x23, x17\n"
- "mov x21, #0x10\n" // cntb _, ALL, #1
- "sub x20, XZR, x21\n"
- "lsr x19, %x[n_channels], #0x2\n"
- "cbz x19, 4f\n"
- "ldr q13, [x5, #0x0]\n"
- "ldr q0, [x5, #0x10]\n"
- "cmp x21, x19, LSL #4\n"
- "ldr q1, [x5, #0x20]\n"
- "ldr q2, [x5, #0x30]\n"
- "ldr q3, [x5, #0x40]\n"
- "ldr q4, [x5, #0x50]\n"
- "ldr q5, [x5, #0x60]\n"
- "ldr q6, [x5, #0x70]\n"
- "ldr q7, [x5, #0x80]\n"
- "ldr q8, [x5, #0x90]\n"
- "add x5, x5, #0xa0\n"
- "ldr q9, [x14, x10]\n"
- "ld1 { v10.4s }, [x8]\n"
- "ldr q11, [x8, x27]\n"
- "ldr q12, [x14, x9]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x2\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x15, x7, x24, LSL #2\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #2\n"
+ "add x8, x8, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x13, %x[n_channels], #0x2\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #2\n"
+ "add x9, x12, x24, LSL #2\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #2\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x26, x9, x24, LSL #2\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #2\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q13, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
"add x20, x20, #0x10\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "add x7, x7, #0x10\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"add x21, x21, #0x10\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "cmp x21, x19, LSL #4\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ldr q9, [x13, x10]\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x26]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x26, x25]\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x7, x4]\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x7, x28]\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x15, x25]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x9]\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
"fmla v31.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x11]\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "ldr q11, [x11, x27]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
- "ldr q12, [x8, x6]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
- "ldr q10, [x13, x9]\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
- "ldr q11, [x8, x28]\n"
- "fmla v27.4s, v8.4s, v9.4s\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v21.4s, v3.4s, v9.4s\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x9, x25]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x15, x11]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"ld1 { v9.4s }, [x15]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q13, [x5, #0x0]\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x15, x27]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v21.4s, v4.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x15, x10]\n"
- "fmla v31.4s, v3.4s, v9.4s\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "ldr q12, [x15, x9]\n"
- "fmla v23.4s, v6.4s, v11.4s\n"
- "fmla v19.4s, v3.4s, v11.4s\n"
- "ldr q11, [x12, x27]\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x14, x6]\n"
- "fmla v20.4s, v8.4s, v11.4s\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
- "ldr q11, [x11, x6]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v24.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v19.4s, v7.4s, v11.4s\n"
- "fmla v18.4s, v6.4s, v11.4s\n"
- "ldr q11, [x11, x28]\n"
- "fmla v31.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v27.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "ldr q11, [x26, x4]\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
"fmla v22.4s, v0.4s, v10.4s\n"
- "ldr q10, [x8, x10]\n"
- "fmla v17.4s, v8.4s, v11.4s\n"
- "fmla v16.4s, v7.4s, v11.4s\n"
- "ldr q11, [x13, x6]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
- "ldr q12, [x8, x9]\n"
- "add x8, x8, #0x10\n"
- "fmla v31.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x26, x28]\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x12, x4]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x12, x28]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x14]\n"
- "fmla v27.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v4.4s, v11.4s\n"
- "fmla v22.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "fmla v18.4s, v0.4s, v11.4s\n"
- "ldr q11, [x13, x28]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x27]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v31.4s, v6.4s, v10.4s\n"
- "ldr q9, [x14, x10]\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x13]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x15, x4]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v24.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "fmla v20.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
- "ldr q11, [x12, x10]\n"
- "fmla v28.4s, v8.4s, v12.4s\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
- "fmla v20.4s, v2.4s, v12.4s\n"
- "ldr q12, [x13, x27]\n"
- "add x13, x13, #0x10\n"
- "fmla v27.4s, v6.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v19.4s, v0.4s, v10.4s\n"
- "ldr q10, [x11, x10]\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v11.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v3.4s, v11.4s\n"
- "ldr q11, [x12, x9]\n"
- "fmla v24.4s, v8.4s, v12.4s\n"
- "fmla v20.4s, v5.4s, v12.4s\n"
- "fmla v16.4s, v2.4s, v12.4s\n"
- "ldr q12, [x11, x9]\n"
- "add x11, x11, #0x10\n"
- "fmla v19.4s, v8.4s, v10.4s\n"
- "fmla v18.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "ldr q10, [x15, x6]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "fmla v20.4s, v6.4s, v11.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v16.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
"ldr q11, [x15, x28]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
"add x15, x15, #0x10\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "fmla v16.4s, v6.4s, v12.4s\n"
- "ldr q12, [x12, x6]\n"
- "fmla v30.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v1.4s, v10.4s\n"
- "fmla v26.4s, v0.4s, v10.4s\n"
- "ldr q10, [x12, x28]\n"
- "add x12, x12, #0x10\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "ldr q0, [x5, #0x10]\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q2, [x5, #0x30]\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q11, [x8, x27]\n"
- "fmla v23.4s, v7.4s, v12.4s\n"
- "ldr q1, [x5, #0x20]\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "ldr q6, [x5, #0x70]\n"
- "fmla v19.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
- "ldr q12, [x14, x9]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "ldr q3, [x5, #0x40]\n"
- "fmla v20.4s, v7.4s, v10.4s\n"
- "ldr q7, [x5, #0x80]\n"
- "fmla v17.4s, v5.4s, v10.4s\n"
- "ldr q5, [x5, #0x60]\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x4]\n"
"fmla v16.4s, v4.4s, v10.4s\n"
- "ld1 { v10.4s }, [x8]\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "ldr q4, [x5, #0x50]\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "ldr q8, [x5, #0x90]\n"
- "add x5, x5, #0xa0\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "st1 { v31.4s }, [x16]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "str q30, [x16, x17]\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "str q29, [x16, x23]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "str q28, [x16, x22]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "add x16, x16, #0x10\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "st1 { v27.4s }, [x26]\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "str q26, [x26, x17]\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "str q25, [x26, x23]\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "str q24, [x26, x22]\n"
- "add x26, x26, #0x10\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "st1 { v23.4s }, [x25]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q22, [x25, x17]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "str q21, [x25, x23]\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "str q20, [x25, x22]\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "add x25, x25, #0x10\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "st1 { v19.4s }, [x24]\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
"fmax v16.4s, v16.4s, v15.4s\n"
- "str q18, [x24, x17]\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "str q17, [x24, x23]\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "ldr q13, [x16, #0x0]\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
- "str q16, [x24, x22]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "st1 { v16.4s }, [x8]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q17, [x8, x5]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q18, [x8, x23]\n"
+ "add x16, x16, #0xa0\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q19, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "st1 { v20.4s }, [x10]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q21, [x10, x5]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q22, [x10, x23]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q23, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v24.4s }, [x27]\n"
+ "str q25, [x27, x5]\n"
+ "str q26, [x27, x23]\n"
+ "str q27, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v28.4s }, [x24]\n"
+ "str q29, [x24, x5]\n"
+ "str q30, [x24, x23]\n"
+ "str q31, [x24, x22]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ldr q9, [x13, x10]\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x26]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x26, x25]\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x7, x4]\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x7, x28]\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x15, x25]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x9]\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
"fmla v31.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x11]\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "ldr q11, [x11, x27]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
- "ldr q12, [x8, x6]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
- "ldr q10, [x13, x9]\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
- "ldr q11, [x8, x28]\n"
- "fmla v27.4s, v8.4s, v9.4s\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v21.4s, v3.4s, v9.4s\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x9, x25]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x15, x11]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"ld1 { v9.4s }, [x15]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x15, x27]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ld1 { v11.4s }, [x12]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v21.4s, v4.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x15, x10]\n"
- "fmla v31.4s, v3.4s, v9.4s\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "ldr q12, [x15, x9]\n"
- "fmla v23.4s, v6.4s, v11.4s\n"
- "fmla v19.4s, v3.4s, v11.4s\n"
- "ldr q11, [x12, x27]\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x14, x6]\n"
- "fmla v20.4s, v8.4s, v11.4s\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
- "ldr q11, [x11, x6]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v24.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v19.4s, v7.4s, v11.4s\n"
- "fmla v18.4s, v6.4s, v11.4s\n"
- "ldr q11, [x11, x28]\n"
- "fmla v31.4s, v7.4s, v10.4s\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v27.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "ldr q11, [x26, x4]\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
"fmla v22.4s, v0.4s, v10.4s\n"
- "ldr q10, [x8, x10]\n"
- "fmla v17.4s, v8.4s, v11.4s\n"
- "fmla v16.4s, v7.4s, v11.4s\n"
- "ldr q11, [x13, x6]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
- "ldr q12, [x8, x9]\n"
- "add x8, x8, #0x10\n"
- "fmla v31.4s, v2.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x26, x28]\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x12, x4]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x12, x28]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x14]\n"
- "fmla v27.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v4.4s, v11.4s\n"
- "fmla v22.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "fmla v18.4s, v0.4s, v11.4s\n"
- "ldr q11, [x13, x28]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v0.4s, v12.4s\n"
- "ldr q12, [x14, x27]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x25]\n"
"add x14, x14, #0x10\n"
- "fmla v31.4s, v6.4s, v10.4s\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x13]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x15, x4]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
- "fmla v24.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "fmla v20.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
- "ldr q11, [x12, x10]\n"
- "fmla v28.4s, v8.4s, v12.4s\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
- "fmla v20.4s, v2.4s, v12.4s\n"
- "ldr q12, [x13, x27]\n"
- "add x13, x13, #0x10\n"
- "fmla v27.4s, v6.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v19.4s, v0.4s, v10.4s\n"
- "ldr q10, [x11, x10]\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v11.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v3.4s, v11.4s\n"
- "ldr q11, [x12, x9]\n"
- "fmla v24.4s, v8.4s, v12.4s\n"
- "fmla v20.4s, v5.4s, v12.4s\n"
- "fmla v16.4s, v2.4s, v12.4s\n"
- "ldr q12, [x11, x9]\n"
- "add x11, x11, #0x10\n"
- "fmla v19.4s, v8.4s, v10.4s\n"
- "fmla v18.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
- "ldr q10, [x15, x6]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "fmla v20.4s, v6.4s, v11.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v16.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
"ldr q11, [x15, x28]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
"add x15, x15, #0x10\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "fmla v16.4s, v6.4s, v12.4s\n"
- "ldr q12, [x12, x6]\n"
- "fmla v30.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v1.4s, v10.4s\n"
- "fmla v26.4s, v0.4s, v10.4s\n"
- "ldr q10, [x12, x28]\n"
- "add x12, x12, #0x10\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "fmla v23.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "fmla v19.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v20.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x4]\n"
"fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "st1 { v31.4s }, [x16]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q30, [x16, x17]\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "str q29, [x16, x23]\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "str q28, [x16, x22]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "add x16, x16, #0x10\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "st1 { v27.4s }, [x26]\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "str q26, [x26, x17]\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "str q25, [x26, x23]\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "str q24, [x26, x22]\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "add x26, x26, #0x10\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "st1 { v23.4s }, [x25]\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "str q22, [x25, x17]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q21, [x25, x23]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q20, [x25, x22]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "add x25, x25, #0x10\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "st1 { v19.4s }, [x24]\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
"fmax v16.4s, v16.4s, v15.4s\n"
- "str q18, [x24, x17]\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "str q17, [x24, x23]\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x28]\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
- "str q16, [x24, x22]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "st1 { v16.4s }, [x8]\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q17, [x8, x5]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q18, [x8, x23]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q19, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "st1 { v20.4s }, [x10]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q21, [x10, x5]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q22, [x10, x23]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q23, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v24.4s }, [x27]\n"
+ "str q25, [x27, x5]\n"
+ "str q26, [x27, x23]\n"
+ "str q27, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v28.4s }, [x24]\n"
+ "str q29, [x24, x5]\n"
+ "str q30, [x24, x23]\n"
+ "str q31, [x24, x22]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 73f\n"
- "ldr q13, [x5, #0x0]\n"
- "ldr q0, [x5, #0x10]\n"
- "add x22, x14, x10\n"
- "ldr q1, [x5, #0x20]\n"
- "add x21, x8, XZR\n"
- "ldr q2, [x5, #0x30]\n"
- "add x20, x8, x27\n"
- "ldr q3, [x5, #0x40]\n"
- "add x19, x14, x9\n"
- "ldr q4, [x5, #0x50]\n"
- "ldr q5, [x5, #0x60]\n"
- "ldr q6, [x5, #0x70]\n"
- "ldr q7, [x5, #0x80]\n"
- "ldr q8, [x5, #0x90]\n"
+ "ldr q13, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
"tbz %x[n_channels], #1, 5f\n"
- "ldr d9, [x22], #0x8\n"
- "ldr d10, [x21], #0x8\n"
- "ldr d11, [x20], #0x8\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v9.s }[2], [x22]\n"
- "ld1 { v10.s }[2], [x21]\n"
- "ld1 { v11.s }[2], [x20]\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x21]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 6f\n"
"5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
- "ldr s9, [x22, #0x0]\n"
- "ldr s10, [x21, #0x0]\n"
- "ldr s11, [x20, #0x0]\n"
- "ldr s12, [x19, #0x0]\n"
+ "ldr s9, [x23, #0x0]\n"
+ "ldr s10, [x22, #0x0]\n"
+ "ldr s11, [x21, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "add x19, x11, XZR\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 7f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
- "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
- "add x19, x11, x27\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x25\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 10f\n"
"9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
- "add x19, x13, x10\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x12, x17\n"
"tbz %x[n_channels], #1, 11f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
- "fmla v27.4s, v8.4s, v9.4s\n"
- "add x19, x8, x6\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v21.4s, v3.4s, v9.4s\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x7, x4\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 14f\n"
"13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
- "fmla v31.4s, v1.4s, v12.4s\n"
- "add x19, x8, x28\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x7, x28\n"
"tbz %x[n_channels], #1, 15f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
- "fmla v29.4s, v2.4s, v11.4s\n"
- "add x19, x13, x9\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x12, x11\n"
"tbz %x[n_channels], #1, 17f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 18f\n"
"17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
- "fmla v26.4s, v8.4s, v10.4s\n"
- "add x19, x15, XZR\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v21.4s, v4.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 19f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
- "fmla v31.4s, v3.4s, v9.4s\n"
- "add x19, x15, x27\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x15, x25\n"
"tbz %x[n_channels], #1, 21f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 22f\n"
"21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
- "fmla v28.4s, v5.4s, v12.4s\n"
- "add x19, x12, XZR\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x9, XZR\n"
"tbz %x[n_channels], #1, 23f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
- "fmla v23.4s, v6.4s, v11.4s\n"
- "add x19, x15, x10\n"
- "fmla v19.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x15, x17\n"
"tbz %x[n_channels], #1, 25f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 26f\n"
"25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
- "fmla v31.4s, v5.4s, v10.4s\n"
- "add x19, x12, x27\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x9, x25\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 27f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
- "fmla v20.4s, v8.4s, v11.4s\n"
- "add x19, x15, x9\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x15, x11\n"
"tbz %x[n_channels], #1, 29f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 30f\n"
"29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
- "fmla v30.4s, v5.4s, v12.4s\n"
- "add x19, x11, x6\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v24.4s, v0.4s, v12.4s\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x26, x4\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 31f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
- "fmla v19.4s, v7.4s, v11.4s\n"
- "add x19, x14, x6\n"
- "fmla v18.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x14, x4\n"
"tbz %x[n_channels], #1, 33f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 34f\n"
"33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
- "fmla v31.4s, v7.4s, v10.4s\n"
- "add x19, x11, x28\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v27.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x28\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 35f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
- "fmla v17.4s, v8.4s, v11.4s\n"
- "add x19, x14, x28\n"
- "fmla v16.4s, v7.4s, v11.4s\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x28\n"
"tbz %x[n_channels], #1, 37f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 38f\n"
"37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
- "fmla v29.4s, v8.4s, v12.4s\n"
- "add x19, x8, x10\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v4.4s, v12.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x7, x17\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
"tbz %x[n_channels], #1, 39f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
- "fmla v31.4s, v2.4s, v10.4s\n"
- "add x19, x13, x6\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x12, x4\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 41f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 42f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 42f\n"
"41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
- "fmla v27.4s, v7.4s, v11.4s\n"
- "add x19, x8, x9\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v4.4s, v11.4s\n"
- "fmla v22.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "fmla v18.4s, v0.4s, v11.4s\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x7, x11\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 43f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
- "fmla v30.4s, v2.4s, v12.4s\n"
- "add x19, x14, XZR\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v0.4s, v12.4s\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 45f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 46f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 46f\n"
"45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v10.4s\n"
- "add x19, x13, x28\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x12, x28\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 47f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
- "fmla v25.4s, v8.4s, v11.4s\n"
- "add x19, x14, x27\n"
- "fmla v24.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "fmla v20.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x25\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 49f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 50f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 50f\n"
"49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
- "fmla v28.4s, v8.4s, v12.4s\n"
- "add x19, x13, XZR\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
- "fmla v20.4s, v2.4s, v12.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 51f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
- "fmla v27.4s, v6.4s, v10.4s\n"
- "add x19, x12, x10\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v19.4s, v0.4s, v10.4s\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x9, x17\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 53f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 54f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 54f\n"
"53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
- "fmla v23.4s, v8.4s, v11.4s\n"
- "add x19, x13, x27\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x12, x25\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
"tbz %x[n_channels], #1, 55f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
- "fmla v24.4s, v8.4s, v12.4s\n"
- "add x19, x11, x10\n"
- "fmla v20.4s, v5.4s, v12.4s\n"
- "fmla v16.4s, v2.4s, v12.4s\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x26, x17\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 57f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 58f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 58f\n"
"57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
- "fmla v19.4s, v8.4s, v10.4s\n"
- "add x19, x12, x9\n"
- "fmla v18.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, x11\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
"tbz %x[n_channels], #1, 59f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 60f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 60f\n"
"59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
- "fmla v22.4s, v8.4s, v11.4s\n"
- "add x19, x11, x9\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "fmla v20.4s, v6.4s, v11.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v16.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x26, x11\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
"tbz %x[n_channels], #1, 61f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 62f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 62f\n"
"61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
- "fmla v18.4s, v8.4s, v12.4s\n"
- "add x19, x15, x6\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "fmla v16.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x15, x4\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
"tbz %x[n_channels], #1, 63f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 64f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 64f\n"
"63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
- "fmla v31.4s, v4.4s, v10.4s\n"
- "add x19, x15, x28\n"
- "fmla v30.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v1.4s, v10.4s\n"
- "fmla v26.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x15, x28\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 65f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 66f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 66f\n"
"65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
- "fmla v29.4s, v5.4s, v11.4s\n"
- "add x19, x12, x6\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x9, x4\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 67f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 68f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 68f\n"
"67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
- "fmla v23.4s, v7.4s, v12.4s\n"
- "add x19, x12, x28\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "fmla v19.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x9, x28\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 69f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 70f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 70f\n"
"69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v20.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v5.4s, v10.4s\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
"fmax v16.4s, v16.4s, v15.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 71f\n"
- "mov x19, x16\n"
- "st1 { v31.d }[0], [x19], x17\n"
- "add x16, x16, #0x8\n"
- "st1 { v30.d }[0], [x19], x17\n"
- "mov x21, x26\n"
- "st1 { v29.d }[0], [x19], x17\n"
- "st1 { v27.d }[0], [x21], x17\n"
- "add x26, x26, #0x8\n"
- "st1 { v28.d }[0], [x19]\n"
- "mov x20, x25\n"
- "st1 { v26.d }[0], [x21], x17\n"
- "add x25, x25, #0x8\n"
- "st1 { v25.d }[0], [x21], x17\n"
- "mov x19, x24\n"
- "st1 { v24.d }[0], [x21]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
"add x24, x24, #0x8\n"
- "st1 { v23.d }[0], [x20], x17\n"
- "st1 { v22.d }[0], [x20], x17\n"
- "st1 { v21.d }[0], [x20], x17\n"
- "st1 { v20.d }[0], [x20]\n"
- "st1 { v19.d }[0], [x19], x17\n"
- "st1 { v18.d }[0], [x19], x17\n"
- "st1 { v17.d }[0], [x19], x17\n"
- "st1 { v16.d }[0], [x19]\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 72f\n"
- "mov x22, x16\n"
- "st1 { v31.s }[2], [x22], x17\n"
- "mov x21, x26\n"
- "st1 { v30.s }[2], [x22], x17\n"
- "st1 { v27.s }[2], [x21], x17\n"
- "mov x20, x25\n"
- "st1 { v29.s }[2], [x22], x17\n"
- "mov x19, x24\n"
- "st1 { v28.s }[2], [x22]\n"
- "st1 { v26.s }[2], [x21], x17\n"
- "st1 { v25.s }[2], [x21], x17\n"
- "st1 { v24.s }[2], [x21]\n"
- "st1 { v23.s }[2], [x20], x17\n"
- "st1 { v22.s }[2], [x20], x17\n"
- "st1 { v21.s }[2], [x20], x17\n"
- "st1 { v20.s }[2], [x20]\n"
- "st1 { v19.s }[2], [x19], x17\n"
- "st1 { v18.s }[2], [x19], x17\n"
- "st1 { v17.s }[2], [x19], x17\n"
- "st1 { v16.s }[2], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 72f\n"
"71:" // Tile loop: Oddments: Store: Bit 1: Unset
- "mov x22, x16\n"
- "st1 { v31.s }[0], [x22], x17\n"
- "mov x21, x26\n"
- "mov x20, x25\n"
- "st1 { v30.s }[0], [x22], x17\n"
- "st1 { v27.s }[0], [x21], x17\n"
- "mov x19, x24\n"
- "st1 { v29.s }[0], [x22], x17\n"
- "st1 { v28.s }[0], [x22]\n"
- "st1 { v26.s }[0], [x21], x17\n"
- "st1 { v25.s }[0], [x21], x17\n"
- "st1 { v24.s }[0], [x21]\n"
- "st1 { v23.s }[0], [x20], x17\n"
- "st1 { v22.s }[0], [x20], x17\n"
- "st1 { v21.s }[0], [x20], x17\n"
- "st1 { v20.s }[0], [x20]\n"
- "st1 { v19.s }[0], [x19], x17\n"
- "st1 { v18.s }[0], [x19], x17\n"
- "st1 { v17.s }[0], [x19], x17\n"
- "st1 { v16.s }[0], [x19]\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"72:" // Tile loop: Oddments: Store: Bit 1: End
-
"73:" // Tile loop: End
- "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x4, #0x1\n"
"ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"add x26, x26, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x26, x19\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
"csel x26, x26, XZR, LT\n"
- "csel x4, x4, x21, LT\n"
- "cmp x4, x20\n"
+ "cmp x27, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index aeaf1049f1..2353045021 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,21 +98,21 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v15.4s }, [x20]\n"
- "ld1r { v14.4s }, [x19]\n"
- "mov x14, #0x0\n"
- "mov x13, #0x10\n" // cntb _, ALL, #1
- "sub x12, XZR, x13\n"
- "lsr x11, %x[n_channels], #0x2\n"
- "cbz x11, 3f\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x13, #0x0\n"
+ "sub x12, XZR, x8\n"
+ "cbz x17, 3f\n"
"ldr q13, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
- "cmp x13, x11, LSL #4\n"
+ "cmp x8, x17, LSL #4\n"
"ldr q1, [x15, #0x20]\n"
"ldr q2, [x15, #0x30]\n"
"ldr q3, [x15, #0x40]\n"
@@ -122,589 +122,589 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
"add x15, x15, #0xa0\n"
- "ldp x10, x9, [x16, #0x0]\n"
- "ldp x28, x27, [x16, #0x10]\n"
- "ldr q9, [x10, x14]\n"
- "ldr q10, [x9, x14]\n"
- "ldr q11, [x28, x14]\n"
- "ldr q12, [x27, x14]\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "ldr q9, [x11, x13]\n"
+ "ldr q10, [x10, x13]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr q12, [x28, x13]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x26, [x16, #0x20]\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x14, #0x20]\n"
+ "ldr x26, [x14, #0x30]\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "ldr x25, [x14, #0x28]\n"
+ "ldr x24, [x14, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x11, [x14, #0x40]\n"
+ "ldr x10, [x14, #0x48]\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x26, x13]\n"
+ "ldr x26, [x14, #0x70]\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x13]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x25, x13]\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr x27, [x14, #0x60]\n"
+ "ldr x25, [x14, #0x68]\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0x88]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0x78]\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "ldr x11, [x14, #0x80]\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
"add x12, x12, #0x10\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "ldr x25, [x16, #0x28]\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x24, [x16, #0x30]\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "ldr x23, [x16, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "ldr x10, [x16, #0x40]\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "ldr x9, [x16, #0x48]\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "ldr x28, [x16, #0x50]\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "ldr x27, [x16, #0x58]\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ldr q9, [x24, x14]\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x28, x13]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "ldr x28, [x14, #0x98]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x27, x13]\n"
+ "ldr x27, [x14, #0xa0]\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
"fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x14]\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "ldr q11, [x25, x14]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "ldr x26, [x16, #0x60]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "ldr x24, [x16, #0x70]\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr x22, [x17, #0x0]\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "ldr x21, [x17, #0x8]\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "ldr x20, [x17, #0x10]\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "ldr x19, [x17, #0x18]\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
- "ldr q10, [x9, x14]\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
- "ldr q11, [x10, x14]\n"
- "fmla v27.4s, v8.4s, v9.4s\n"
- "ldr x23, [x16, #0x78]\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v21.4s, v3.4s, v9.4s\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
- "ldr q9, [x28, x14]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x28, [x16, #0x90]\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x27, x14]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x27, [x16, #0x98]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x26, x14]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "ldr q13, [x15, #0x0]\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v21.4s, v4.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x25, x14]\n"
- "fmla v31.4s, v3.4s, v9.4s\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "fmla v23.4s, v6.4s, v11.4s\n"
- "ldr x23, [x16, #0xb8]\n"
- "fmla v19.4s, v3.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x14]\n"
- "fmla v20.4s, v8.4s, v11.4s\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
- "ldr q11, [x10, x14]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v24.4s, v0.4s, v12.4s\n"
- "ldr q12, [x27, x14]\n"
- "fmla v19.4s, v7.4s, v11.4s\n"
- "ldr x27, [x16, #0xd8]\n"
- "fmla v18.4s, v6.4s, v11.4s\n"
- "ldr q11, [x28, x14]\n"
- "fmla v31.4s, v7.4s, v10.4s\n"
- "ldr x28, [x16, #0xd0]\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v27.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0xb8]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x9, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x25, x13]\n"
+ "ldr x25, [x14, #0xa8]\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "ldr x11, [x14, #0xc0]\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
"fmla v22.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x14]\n"
- "fmla v17.4s, v8.4s, v11.4s\n"
- "ldr x26, [x16, #0xe0]\n"
- "fmla v16.4s, v7.4s, v11.4s\n"
- "ldr q11, [x25, x14]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0xc8]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
- "ldr q12, [x24, x14]\n"
- "fmla v31.4s, v2.4s, v10.4s\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
- "ldr q10, [x23, x14]\n"
- "fmla v27.4s, v7.4s, v11.4s\n"
- "ldr x23, [x16, #0xf8]\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v4.4s, v11.4s\n"
- "fmla v22.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "fmla v18.4s, v0.4s, v11.4s\n"
- "ldr q11, [x10, x14]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x14]\n"
- "fmla v31.4s, v6.4s, v10.4s\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x14]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "ldr x28, [x16, #0x110]\n"
- "fmla v24.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "fmla v20.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x14]\n"
- "fmla v28.4s, v8.4s, v12.4s\n"
- "ldr x27, [x16, #0x118]\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
- "fmla v20.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x14]\n"
- "fmla v27.4s, v6.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v19.4s, v0.4s, v10.4s\n"
- "ldr q10, [x25, x14]\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v11.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v3.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v24.4s, v8.4s, v12.4s\n"
- "fmla v20.4s, v5.4s, v12.4s\n"
- "fmla v16.4s, v2.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "fmla v19.4s, v8.4s, v10.4s\n"
- "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x28, x13]\n"
+ "ldr x28, [x14, #0xd8]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr x9, [x14, #0xd0]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
"fmla v17.4s, v6.4s, v10.4s\n"
- "ldr q10, [x10, x14]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "fmla v20.4s, v6.4s, v11.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v16.4s, v3.4s, v11.4s\n"
- "ldr q11, [x9, x14]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr x27, [x14, #0xe0]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
- "ldr q9, [x10, x13]\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "fmla v16.4s, v6.4s, v12.4s\n"
- "ldr q12, [x28, x14]\n"
- "fmla v30.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v1.4s, v10.4s\n"
- "fmla v26.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x14]\n"
- "add x14, x14, #0x10\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x25, x13]\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "ldr x25, [x14, #0xe8]\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x26, x13]\n"
+ "ldr x26, [x14, #0xf0]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x24, x13]\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "ldr x24, [x14, #0xf8]\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "ldr x11, [x14, #0x100]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x10, x13]\n"
+ "ldr x10, [x14, #0x108]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x13]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "ldr x9, [x14, #0x110]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "ldr x28, [x14, #0x118]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "ldr q10, [x25, x13]\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x11, x13]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
"fmla v29.4s, v5.4s, v11.4s\n"
- "ldp x28, x27, [x16, #0x10]\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x10, x13]\n"
+ "ldp x11, x10, [x14, #0x0]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x13]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x13]\n"
+ "ldr q9, [x11, x8]\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "ldr q13, [x15, #0x0]\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
"ldr q0, [x15, #0x10]\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q11, [x28, x13]\n"
- "fmla v23.4s, v7.4s, v12.4s\n"
"ldr q1, [x15, #0x20]\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
"ldr q6, [x15, #0x70]\n"
- "fmla v19.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
- "ldr q12, [x27, x13]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v20.4s, v7.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
"ldr q7, [x15, #0x80]\n"
- "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "str q16, [x23, x12]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q17, [x22, x12]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "str q18, [x21, x12]\n"
+ "ldr x22, [x16, #0x28]\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q19, [x20, x12]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
"ldr q5, [x15, #0x60]\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "ldr q10, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldr q10, [x10, x8]\n"
"ldr q4, [x15, #0x50]\n"
- "cmp x13, x11, LSL #4\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q31, [x22, x12]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "ldr x22, [x17, #0x20]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "str q30, [x21, x12]\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "str q29, [x20, x12]\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "str q28, [x19, x12]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q20, [x23, x12]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q21, [x22, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
"fmax v24.4s, v24.4s, v15.4s\n"
- "ldr x21, [x17, #0x28]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "ldr x20, [x17, #0x30]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "ldr x19, [x17, #0x38]\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "str q27, [x22, x12]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "str q22, [x21, x12]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q23, [x20, x12]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldp x9, x28, [x14, #0x10]\n"
"fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "ldr q11, [x9, x8]\n"
+ "ldr q12, [x28, x8]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "str q24, [x23, x12]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "str q25, [x22, x12]\n"
+ "ldr x22, [x16, #0x68]\n"
"str q26, [x21, x12]\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "str q25, [x20, x12]\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "str q24, [x19, x12]\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "ldr x22, [x17, #0x40]\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "ldr x21, [x17, #0x48]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "ldr x20, [x17, #0x50]\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q23, [x22, x12]\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "str q22, [x21, x12]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "str q21, [x20, x12]\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "ldr x19, [x17, #0x58]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "ldr x22, [x17, #0x60]\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "ldr x21, [x17, #0x68]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "str q20, [x19, x12]\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "str q19, [x22, x12]\n"
- "fmax v16.4s, v16.4s, v15.4s\n"
- "str q18, [x21, x12]\n"
- "ldr x20, [x17, #0x70]\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "ldr x19, [x17, #0x78]\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "str q17, [x20, x12]\n"
- "str q16, [x19, x12]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "add x8, x8, #0x10\n"
+ "cmp x8, x17, LSL #4\n"
+ "str q27, [x20, x12]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "add x13, x13, #0x10\n"
+ "str q28, [x23, x12]\n"
+ "str q29, [x22, x12]\n"
+ "add x15, x15, #0xa0\n"
+ "str q30, [x21, x12]\n"
+ "str q31, [x20, x12]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x26, [x16, #0x20]\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x14, #0x20]\n"
+ "ldr x26, [x14, #0x30]\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "ldr x25, [x14, #0x28]\n"
+ "ldr x24, [x14, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x11, [x14, #0x40]\n"
+ "ldr x10, [x14, #0x48]\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "ldr x9, [x14, #0x50]\n"
+ "ldr x28, [x14, #0x58]\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x26, x13]\n"
+ "ldr x26, [x14, #0x70]\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x13]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x25, x13]\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr x27, [x14, #0x60]\n"
+ "ldr x25, [x14, #0x68]\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0x88]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0x78]\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "ldr x11, [x14, #0x80]\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
"add x12, x12, #0x10\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "ldr x25, [x16, #0x28]\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "ldr x24, [x16, #0x30]\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "ldr x23, [x16, #0x38]\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "ldr x10, [x16, #0x40]\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "ldr x9, [x16, #0x48]\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "ldr x28, [x16, #0x50]\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "ldr x27, [x16, #0x58]\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ldr q9, [x24, x14]\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x28, x13]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "ldr x28, [x14, #0x98]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x27, x13]\n"
+ "ldr x27, [x14, #0xa0]\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
"fmla v31.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x14]\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "ldr q11, [x25, x14]\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "ldr x26, [x16, #0x60]\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "ldr x24, [x16, #0x70]\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "ldr x22, [x17, #0x0]\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "ldr x21, [x17, #0x8]\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "ldr x20, [x17, #0x10]\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "ldr x19, [x17, #0x18]\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
- "ldr q10, [x9, x14]\n"
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
- "ldr q11, [x10, x14]\n"
- "fmla v27.4s, v8.4s, v9.4s\n"
- "ldr x23, [x16, #0x78]\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v21.4s, v3.4s, v9.4s\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
- "ldr q9, [x28, x14]\n"
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x28, [x16, #0x90]\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "ldr q12, [x27, x14]\n"
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x27, [x16, #0x98]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "ldr q11, [x26, x14]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v21.4s, v4.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "ldr q10, [x25, x14]\n"
- "fmla v31.4s, v3.4s, v9.4s\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "fmla v23.4s, v6.4s, v11.4s\n"
- "ldr x23, [x16, #0xb8]\n"
- "fmla v19.4s, v3.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v31.4s, v5.4s, v10.4s\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "fmla v29.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x14]\n"
- "fmla v20.4s, v8.4s, v11.4s\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
- "ldr q11, [x10, x14]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v24.4s, v0.4s, v12.4s\n"
- "ldr q12, [x27, x14]\n"
- "fmla v19.4s, v7.4s, v11.4s\n"
- "ldr x27, [x16, #0xd8]\n"
- "fmla v18.4s, v6.4s, v11.4s\n"
- "ldr q11, [x28, x14]\n"
- "fmla v31.4s, v7.4s, v10.4s\n"
- "ldr x28, [x16, #0xd0]\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "fmla v27.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "ldr x26, [x14, #0xb0]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
+ "ldr x24, [x14, #0xb8]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x9, x13]\n"
+ "ldr x9, [x14, #0x90]\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x25, x13]\n"
+ "ldr x25, [x14, #0xa8]\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "ldr x11, [x14, #0xc0]\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
"fmla v22.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x14]\n"
- "fmla v17.4s, v8.4s, v11.4s\n"
- "ldr x26, [x16, #0xe0]\n"
- "fmla v16.4s, v7.4s, v11.4s\n"
- "ldr q11, [x25, x14]\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmla v25.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x10, x13]\n"
+ "ldr x10, [x14, #0xc8]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
- "ldr q12, [x24, x14]\n"
- "fmla v31.4s, v2.4s, v10.4s\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
- "ldr q10, [x23, x14]\n"
- "fmla v27.4s, v7.4s, v11.4s\n"
- "ldr x23, [x16, #0xf8]\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v4.4s, v11.4s\n"
- "fmla v22.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "fmla v18.4s, v0.4s, v11.4s\n"
- "ldr q11, [x10, x14]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v0.4s, v12.4s\n"
- "ldr q12, [x9, x14]\n"
- "fmla v31.4s, v6.4s, v10.4s\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ldr q10, [x28, x14]\n"
- "fmla v25.4s, v8.4s, v11.4s\n"
- "ldr x28, [x16, #0x110]\n"
- "fmla v24.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "fmla v20.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
- "ldr q11, [x27, x14]\n"
- "fmla v28.4s, v8.4s, v12.4s\n"
- "ldr x27, [x16, #0x118]\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
- "fmla v20.4s, v2.4s, v12.4s\n"
- "ldr q12, [x26, x14]\n"
- "fmla v27.4s, v6.4s, v10.4s\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "fmla v19.4s, v0.4s, v10.4s\n"
- "ldr q10, [x25, x14]\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v23.4s, v8.4s, v11.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v3.4s, v11.4s\n"
- "ldr q11, [x24, x14]\n"
- "fmla v24.4s, v8.4s, v12.4s\n"
- "fmla v20.4s, v5.4s, v12.4s\n"
- "fmla v16.4s, v2.4s, v12.4s\n"
- "ldr q12, [x23, x14]\n"
- "fmla v19.4s, v8.4s, v10.4s\n"
- "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x28, x13]\n"
+ "ldr x28, [x14, #0xd8]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x9, x13]\n"
+ "ldr x9, [x14, #0xd0]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
"fmla v17.4s, v6.4s, v10.4s\n"
- "ldr q10, [x10, x14]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "fmla v20.4s, v6.4s, v11.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v16.4s, v3.4s, v11.4s\n"
- "ldr q11, [x9, x14]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr x27, [x14, #0xe0]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "fmla v16.4s, v6.4s, v12.4s\n"
- "ldr q12, [x28, x14]\n"
- "fmla v30.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v1.4s, v10.4s\n"
- "fmla v26.4s, v0.4s, v10.4s\n"
- "ldr q10, [x27, x14]\n"
- "add x14, x14, #0x10\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x25, x13]\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "ldr x25, [x14, #0xe8]\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x26, x13]\n"
+ "ldr x26, [x14, #0xf0]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x24, x13]\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "ldr x24, [x14, #0xf8]\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "ldr x11, [x14, #0x100]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x10, x13]\n"
+ "ldr x10, [x14, #0x108]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x13]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "ldr x9, [x14, #0x110]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "ldr x28, [x14, #0x118]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "ldr q10, [x25, x13]\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x26, x13]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x11, x13]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
"fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "fmla v23.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "fmla v19.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v20.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x10, x13]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x13]\n"
"fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q31, [x22, x12]\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "ldr x22, [x17, #0x20]\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "str q30, [x21, x12]\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "str q29, [x20, x12]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "ldr x21, [x17, #0x28]\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "ldr x20, [x17, #0x30]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "str q28, [x19, x12]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "ldr x19, [x17, #0x38]\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "str q27, [x22, x12]\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "str q26, [x21, x12]\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "str q25, [x20, x12]\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "ldr x22, [x17, #0x40]\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "ldr x21, [x17, #0x48]\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "ldr x20, [x17, #0x50]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "str q24, [x19, x12]\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "str q23, [x22, x12]\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "str q22, [x21, x12]\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "ldr x19, [x17, #0x58]\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "ldr x22, [x17, #0x60]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "ldr x21, [x17, #0x68]\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "str q21, [x20, x12]\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "str q20, [x19, x12]\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "str q19, [x22, x12]\n"
"fmax v16.4s, v16.4s, v15.4s\n"
- "ldr x20, [x17, #0x70]\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "ldr x19, [x17, #0x78]\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x13]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x13]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
"fmin v17.4s, v17.4s, v14.4s\n"
+ "str q16, [x23, x12]\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "str q17, [x22, x12]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
"str q18, [x21, x12]\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "str q17, [x20, x12]\n"
- "str q16, [x19, x12]\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "str q19, [x20, x12]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "ldr x22, [x16, #0x28]\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q20, [x23, x12]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q21, [x22, x12]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "str q22, [x21, x12]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q23, [x20, x12]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q24, [x23, x12]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q25, [x22, x12]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "str q26, [x21, x12]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "str q27, [x20, x12]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q28, [x23, x12]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q29, [x22, x12]\n"
+ "add x13, x13, #0x10\n"
+ "str q30, [x21, x12]\n"
+ "str q31, [x20, x12]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 72f\n"
"ldr q13, [x15, #0x0]\n"
"ldr q0, [x15, #0x10]\n"
- "mov x12, x14\n"
+ "mov x12, x13\n"
"ldr q1, [x15, #0x20]\n"
"ldr q2, [x15, #0x30]\n"
"ldr q3, [x15, #0x40]\n"
@@ -713,683 +713,681 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr q6, [x15, #0x70]\n"
"ldr q7, [x15, #0x80]\n"
"ldr q8, [x15, #0x90]\n"
- "ldr x10, [x16, #0x0]\n"
- "add x10, x10, x14\n"
- "ldr x9, [x16, #0x8]\n"
- "ldr x28, [x16, #0x10]\n"
- "add x9, x9, x14\n"
- "ldr x27, [x16, #0x18]\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
+ "ldr x23, [x14, #0x0]\n"
+ "ldr x22, [x14, #0x8]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x21, [x14, #0x10]\n"
+ "ldr x20, [x14, #0x18]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.d }[0], [x10], #0x8\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 5f\n"
- "ld1 { v9.s }[2], [x10], #0x4\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 5f\n"
"4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
- "ld1 { v9.s }[0], [x10], #0x4\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
- "ld1 { v11.s }[0], [x28], #0x4\n"
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x26, [x16, #0x20]\n"
- "add x26, x26, x14\n"
- "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
- "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "fmla v30.4s, v8.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v12.4s\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
- "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x20, x20, x13\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 7f\n"
"6:" // Oddments: Load input (5, 0): Bit 1: Unset
- "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (5, 0): Bit 1: End
- "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
- "ldr x25, [x16, #0x28]\n"
- "add x25, x25, x14\n"
+ "ldr x20, [x14, #0x28]\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 9f\n"
"8:" // Oddments: Load input (5, 5): Bit 1: Unset
- "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (5, 5): Bit 1: End
- "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
- "ldr x24, [x16, #0x30]\n"
- "add x24, x24, x14\n"
+ "ldr x20, [x14, #0x30]\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Load input (3, 2): Bit 1: Unset
- "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (3, 2): Bit 1: End
- "fmla v27.4s, v8.4s, v9.4s\n"
- "ldr x23, [x16, #0x38]\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "add x23, x23, x14\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v21.4s, v3.4s, v9.4s\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
- "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
- "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 13f\n"
"12:" // Oddments: Load input (0, 1): Bit 1: Unset
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 1): Bit 1: End
- "fmla v31.4s, v1.4s, v12.4s\n"
- "ldr x10, [x16, #0x40]\n"
- "fmla v30.4s, v0.4s, v12.4s\n"
- "add x10, x10, x14\n"
+ "ldr x20, [x14, #0x40]\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v11.s }[2], [x10], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 15f\n"
"14:" // Oddments: Load input (0, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x10], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (0, 4): Bit 1: End
- "fmla v29.4s, v2.4s, v11.4s\n"
- "ldr x9, [x16, #0x48]\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "add x9, x9, x14\n"
+ "ldr x20, [x14, #0x48]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: Load input (3, 3): Bit 1: Unset
- "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (3, 3): Bit 1: End
- "fmla v26.4s, v8.4s, v10.4s\n"
- "ldr x28, [x16, #0x50]\n"
- "fmla v25.4s, v7.4s, v10.4s\n"
- "add x28, x28, x14\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v21.4s, v4.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v18.4s, v2.4s, v10.4s\n"
- "fmla v17.4s, v1.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0x50]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v9.d }[0], [x28], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 19f\n"
"18:" // Oddments: Load input (1, 0): Bit 1: Unset
- "ld1 { v9.s }[0], [x28], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 0): Bit 1: End
- "fmla v31.4s, v3.4s, v9.4s\n"
- "ldr x27, [x16, #0x58]\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "add x27, x27, x14\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 21f\n"
"20:" // Oddments: Load input (1, 5): Bit 1: Unset
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (1, 5): Bit 1: End
- "fmla v28.4s, v5.4s, v12.4s\n"
- "ldr x26, [x16, #0x60]\n"
- "fmla v24.4s, v2.4s, v12.4s\n"
- "add x26, x26, x14\n"
+ "ldr x20, [x14, #0x60]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 23f\n"
"22:" // Oddments: Load input (4, 0): Bit 1: Unset
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (4, 0): Bit 1: End
- "fmla v23.4s, v6.4s, v11.4s\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla v19.4s, v3.4s, v11.4s\n"
- "add x25, x25, x14\n"
+ "ldr x20, [x14, #0x68]\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 25f\n"
"24:" // Oddments: Load input (1, 2): Bit 1: Unset
- "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (1, 2): Bit 1: End
- "fmla v31.4s, v5.4s, v10.4s\n"
- "ldr x24, [x16, #0x70]\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
- "add x24, x24, x14\n"
- "fmla v29.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0x70]\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 27f\n"
"26:" // Oddments: Load input (4, 5): Bit 1: Unset
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 5): Bit 1: End
- "fmla v20.4s, v8.4s, v11.4s\n"
- "ldr x23, [x16, #0x78]\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
- "add x23, x23, x14\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 29f\n"
"28:" // Oddments: Load input (1, 3): Bit 1: Unset
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (1, 3): Bit 1: End
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr x10, [x16, #0x80]\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
- "add x10, x10, x14\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v26.4s, v2.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v24.4s, v0.4s, v12.4s\n"
+ "ldr x20, [x14, #0x80]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v11.s }[2], [x10], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 31f\n"
"30:" // Oddments: Load input (5, 1): Bit 1: Unset
- "ld1 { v11.s }[0], [x10], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (5, 1): Bit 1: End
- "fmla v19.4s, v7.4s, v11.4s\n"
- "ldr x9, [x16, #0x88]\n"
- "fmla v18.4s, v6.4s, v11.4s\n"
- "add x9, x9, x14\n"
+ "ldr x20, [x14, #0x88]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 33f\n"
"32:" // Oddments: Load input (2, 1): Bit 1: Unset
- "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (2, 1): Bit 1: End
- "fmla v31.4s, v7.4s, v10.4s\n"
- "ldr x28, [x16, #0x90]\n"
- "fmla v30.4s, v6.4s, v10.4s\n"
- "add x28, x28, x14\n"
- "fmla v27.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0x90]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v11.d }[0], [x28], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 35f\n"
"34:" // Oddments: Load input (5, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (5, 4): Bit 1: End
- "fmla v17.4s, v8.4s, v11.4s\n"
- "ldr x27, [x16, #0x98]\n"
- "fmla v16.4s, v7.4s, v11.4s\n"
- "add x27, x27, x14\n"
+ "ldr x20, [x14, #0x98]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
- "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 37f\n"
"36:" // Oddments: Load input (2, 4): Bit 1: Unset
- "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (2, 4): Bit 1: End
- "fmla v29.4s, v8.4s, v12.4s\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "add x26, x26, x14\n"
- "fmla v25.4s, v5.4s, v12.4s\n"
- "fmla v24.4s, v4.4s, v12.4s\n"
- "fmla v21.4s, v2.4s, v12.4s\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
+ "ldr x20, [x14, #0xa0]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 39f\n"
"38:" // Oddments: Load input (0, 2): Bit 1: Unset
- "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (0, 2): Bit 1: End
- "fmla v31.4s, v2.4s, v10.4s\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla v30.4s, v1.4s, v10.4s\n"
- "add x25, x25, x14\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0xa8]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
- "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 41f\n"
"40:" // Oddments: Load input (3, 1): Bit 1: Unset
- "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (3, 1): Bit 1: End
- "fmla v27.4s, v7.4s, v11.4s\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "add x24, x24, x14\n"
- "fmla v23.4s, v4.4s, v11.4s\n"
- "fmla v22.4s, v3.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "fmla v18.4s, v0.4s, v11.4s\n"
+ "ldr x20, [x14, #0xb0]\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 43f\n"
"42:" // Oddments: Load input (0, 3): Bit 1: Unset
- "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (0, 3): Bit 1: End
- "fmla v30.4s, v2.4s, v12.4s\n"
- "ldr x23, [x16, #0xb8]\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "add x23, x23, x14\n"
- "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr x20, [x14, #0xb8]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 45f\n"
- "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 45f\n"
"44:" // Oddments: Load input (2, 0): Bit 1: Unset
- "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"45:" // Oddments: Load input (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v10.4s\n"
- "ldr x10, [x16, #0xc0]\n"
- "fmla v27.4s, v3.4s, v10.4s\n"
- "add x10, x10, x14\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0xc0]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 46f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v11.s }[2], [x10], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 47f\n"
"46:" // Oddments: Load input (3, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x10], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"47:" // Oddments: Load input (3, 4): Bit 1: End
- "fmla v25.4s, v8.4s, v11.4s\n"
- "ldr x9, [x16, #0xc8]\n"
- "fmla v24.4s, v7.4s, v11.4s\n"
- "add x9, x9, x14\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "fmla v20.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
+ "ldr x20, [x14, #0xc8]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v12.d }[0], [x9], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 49f\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 49f\n"
"48:" // Oddments: Load input (2, 5): Bit 1: Unset
- "ld1 { v12.s }[0], [x9], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"49:" // Oddments: Load input (2, 5): Bit 1: End
- "fmla v28.4s, v8.4s, v12.4s\n"
- "ldr x28, [x16, #0xd0]\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
- "add x28, x28, x14\n"
- "fmla v20.4s, v2.4s, v12.4s\n"
+ "ldr x20, [x14, #0xd0]\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v10.d }[0], [x28], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v10.s }[2], [x28], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 51f\n"
"50:" // Oddments: Load input (3, 0): Bit 1: Unset
- "ld1 { v10.s }[0], [x28], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"51:" // Oddments: Load input (3, 0): Bit 1: End
- "fmla v27.4s, v6.4s, v10.4s\n"
- "ldr x27, [x16, #0xd8]\n"
- "fmla v23.4s, v3.4s, v10.4s\n"
- "add x27, x27, x14\n"
- "fmla v19.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0xd8]\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 52f\n"
- "ld1 { v11.d }[0], [x27], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 53f\n"
- "ld1 { v11.s }[2], [x27], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 53f\n"
"52:" // Oddments: Load input (4, 2): Bit 1: Unset
- "ld1 { v11.s }[0], [x27], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"53:" // Oddments: Load input (4, 2): Bit 1: End
- "fmla v23.4s, v8.4s, v11.4s\n"
- "ldr x26, [x16, #0xe0]\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "add x26, x26, x14\n"
- "fmla v21.4s, v6.4s, v11.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v17.4s, v3.4s, v11.4s\n"
+ "ldr x20, [x14, #0xe0]\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
"tbz %x[n_channels], #1, 54f\n"
- "ld1 { v12.d }[0], [x26], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 55f\n"
"54:" // Oddments: Load input (3, 5): Bit 1: Unset
- "ld1 { v12.s }[0], [x26], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"55:" // Oddments: Load input (3, 5): Bit 1: End
- "fmla v24.4s, v8.4s, v12.4s\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla v20.4s, v5.4s, v12.4s\n"
- "add x25, x25, x14\n"
- "fmla v16.4s, v2.4s, v12.4s\n"
+ "ldr x20, [x14, #0xe8]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 56f\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 57f\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 57f\n"
"56:" // Oddments: Load input (5, 2): Bit 1: Unset
- "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"57:" // Oddments: Load input (5, 2): Bit 1: End
- "fmla v19.4s, v8.4s, v10.4s\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla v18.4s, v7.4s, v10.4s\n"
- "add x24, x24, x14\n"
- "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr x20, [x14, #0xf0]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
"tbz %x[n_channels], #1, 58f\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 59f\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 59f\n"
"58:" // Oddments: Load input (4, 3): Bit 1: Unset
- "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"59:" // Oddments: Load input (4, 3): Bit 1: End
- "fmla v22.4s, v8.4s, v11.4s\n"
- "ldr x23, [x16, #0xf8]\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "add x23, x23, x14\n"
- "fmla v20.4s, v6.4s, v11.4s\n"
- "fmla v18.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v16.4s, v3.4s, v11.4s\n"
+ "ldr x20, [x14, #0xf8]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
"tbz %x[n_channels], #1, 60f\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 61f\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 61f\n"
"60:" // Oddments: Load input (5, 3): Bit 1: Unset
- "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"61:" // Oddments: Load input (5, 3): Bit 1: End
- "fmla v18.4s, v8.4s, v12.4s\n"
- "ldr x10, [x16, #0x100]\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "add x10, x10, x14\n"
- "fmla v16.4s, v6.4s, v12.4s\n"
+ "ldr x20, [x14, #0x100]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
"tbz %x[n_channels], #1, 62f\n"
- "ld1 { v10.d }[0], [x10], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 63f\n"
- "ld1 { v10.s }[2], [x10], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 63f\n"
"62:" // Oddments: Load input (1, 1): Bit 1: Unset
- "ld1 { v10.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"63:" // Oddments: Load input (1, 1): Bit 1: End
- "fmla v31.4s, v4.4s, v10.4s\n"
- "ldr x9, [x16, #0x108]\n"
- "fmla v30.4s, v3.4s, v10.4s\n"
- "add x9, x9, x14\n"
- "fmla v27.4s, v1.4s, v10.4s\n"
- "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x14, #0x108]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 64f\n"
- "ld1 { v11.d }[0], [x9], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 65f\n"
- "ld1 { v11.s }[2], [x9], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 65f\n"
"64:" // Oddments: Load input (1, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"65:" // Oddments: Load input (1, 4): Bit 1: End
- "fmla v29.4s, v5.4s, v11.4s\n"
- "ldr x28, [x16, #0x110]\n"
- "fmla v28.4s, v4.4s, v11.4s\n"
- "add x28, x28, x14\n"
- "fmla v25.4s, v2.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr x20, [x14, #0x110]\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 66f\n"
- "ld1 { v12.d }[0], [x28], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 67f\n"
- "ld1 { v12.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 67f\n"
"66:" // Oddments: Load input (4, 1): Bit 1: Unset
- "ld1 { v12.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"67:" // Oddments: Load input (4, 1): Bit 1: End
- "fmla v23.4s, v7.4s, v12.4s\n"
- "ldr x27, [x16, #0x118]\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "add x27, x27, x14\n"
- "fmla v19.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
+ "ldr x20, [x14, #0x118]\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 68f\n"
- "ld1 { v10.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 69f\n"
- "ld1 { v10.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 69f\n"
"68:" // Oddments: Load input (4, 4): Bit 1: Unset
- "ld1 { v10.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"69:" // Oddments: Load input (4, 4): Bit 1: End
- "fmla v21.4s, v8.4s, v10.4s\n"
- "fmla v20.4s, v7.4s, v10.4s\n"
- "fmla v17.4s, v5.4s, v10.4s\n"
- "fmla v16.4s, v4.4s, v10.4s\n"
- "fmax v31.4s, v31.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v15.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v15.4s\n"
- "fmax v18.4s, v18.4s, v15.4s\n"
- "fmax v17.4s, v17.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
"fmax v16.4s, v16.4s, v15.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 70f\n"
- "ldr x22, [x17, #0x0]\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
"add x22, x22, x12\n"
- "ldr x20, [x17, #0x10]\n"
- "ldr x19, [x17, #0x18]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"add x21, x21, x12\n"
- "st1 { v31.d }[0], [x22]\n"
"add x20, x20, x12\n"
- "st1 { v30.d }[0], [x21]\n"
- "ldr x22, [x17, #0x20]\n"
- "add x19, x19, x12\n"
- "st1 { v29.d }[0], [x20]\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
"add x22, x22, x12\n"
- "st1 { v28.d }[0], [x19]\n"
- "ldr x21, [x17, #0x28]\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
"add x21, x21, x12\n"
- "st1 { v27.d }[0], [x22]\n"
- "ldr x20, [x17, #0x30]\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
"add x20, x20, x12\n"
- "st1 { v26.d }[0], [x21]\n"
- "ldr x19, [x17, #0x38]\n"
- "add x19, x19, x12\n"
- "st1 { v25.d }[0], [x20]\n"
- "ldr x22, [x17, #0x40]\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
"add x22, x22, x12\n"
- "st1 { v24.d }[0], [x19]\n"
- "ldr x21, [x17, #0x48]\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
"add x21, x21, x12\n"
- "st1 { v23.d }[0], [x22]\n"
- "ldr x20, [x17, #0x50]\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
"add x20, x20, x12\n"
- "st1 { v22.d }[0], [x21]\n"
- "ldr x19, [x17, #0x58]\n"
- "add x19, x19, x12\n"
- "st1 { v21.d }[0], [x20]\n"
- "ldr x22, [x17, #0x60]\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
"add x22, x22, x12\n"
- "st1 { v20.d }[0], [x19]\n"
- "ldr x21, [x17, #0x68]\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
"add x21, x21, x12\n"
- "st1 { v19.d }[0], [x22]\n"
- "ldr x20, [x17, #0x70]\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
"add x20, x20, x12\n"
- "st1 { v18.d }[0], [x21]\n"
- "ldr x19, [x17, #0x78]\n"
- "add x19, x19, x12\n"
- "st1 { v17.d }[0], [x20]\n"
"add x12, x12, #0x8\n"
- "st1 { v16.d }[0], [x19]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 71f\n"
- "ldr x22, [x17, #0x0]\n"
- "ldr x21, [x17, #0x8]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
"add x22, x22, x12\n"
- "ldr x20, [x17, #0x10]\n"
- "ldr x19, [x17, #0x18]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"add x21, x21, x12\n"
- "st1 { v31.s }[2], [x22]\n"
"add x20, x20, x12\n"
- "st1 { v30.s }[2], [x21]\n"
- "ldr x22, [x17, #0x20]\n"
- "add x19, x19, x12\n"
- "st1 { v29.s }[2], [x20]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
"add x22, x22, x12\n"
- "st1 { v28.s }[2], [x19]\n"
- "ldr x21, [x17, #0x28]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
"add x21, x21, x12\n"
- "st1 { v27.s }[2], [x22]\n"
- "ldr x20, [x17, #0x30]\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
"add x20, x20, x12\n"
- "st1 { v26.s }[2], [x21]\n"
- "ldr x19, [x17, #0x38]\n"
- "add x19, x19, x12\n"
- "st1 { v25.s }[2], [x20]\n"
- "ldr x22, [x17, #0x40]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
"add x22, x22, x12\n"
- "st1 { v24.s }[2], [x19]\n"
- "ldr x21, [x17, #0x48]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
"add x21, x21, x12\n"
- "st1 { v23.s }[2], [x22]\n"
- "ldr x20, [x17, #0x50]\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
"add x20, x20, x12\n"
- "st1 { v22.s }[2], [x21]\n"
- "ldr x19, [x17, #0x58]\n"
- "add x19, x19, x12\n"
- "st1 { v21.s }[2], [x20]\n"
- "ldr x22, [x17, #0x60]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
"add x22, x22, x12\n"
- "st1 { v20.s }[2], [x19]\n"
- "ldr x21, [x17, #0x68]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
"add x21, x21, x12\n"
- "st1 { v19.s }[2], [x22]\n"
- "ldr x20, [x17, #0x70]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
"add x20, x20, x12\n"
- "st1 { v18.s }[2], [x21]\n"
- "ldr x19, [x17, #0x78]\n"
- "add x19, x19, x12\n"
- "st1 { v17.s }[2], [x20]\n"
- "st1 { v16.s }[2], [x19]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Store: Bit 1: Unset
- "ldr x22, [x17, #0x0]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x12\n"
"add x22, x22, x12\n"
- "ldr x21, [x17, #0x8]\n"
- "ldr x20, [x17, #0x10]\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
"add x21, x21, x12\n"
- "st1 { v31.s }[0], [x22]\n"
- "ldr x19, [x17, #0x18]\n"
"add x20, x20, x12\n"
- "st1 { v30.s }[0], [x21]\n"
- "add x19, x19, x12\n"
- "st1 { v29.s }[0], [x20]\n"
- "ldr x22, [x17, #0x20]\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x20]\n"
+ "add x23, x23, x12\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x16, #0x28]\n"
"add x22, x22, x12\n"
- "st1 { v28.s }[0], [x19]\n"
- "ldr x21, [x17, #0x28]\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x16, #0x30]\n"
"add x21, x21, x12\n"
- "st1 { v27.s }[0], [x22]\n"
- "ldr x20, [x17, #0x30]\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x38]\n"
"add x20, x20, x12\n"
- "st1 { v26.s }[0], [x21]\n"
- "ldr x19, [x17, #0x38]\n"
- "add x19, x19, x12\n"
- "st1 { v25.s }[0], [x20]\n"
- "ldr x22, [x17, #0x40]\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "add x23, x23, x12\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x16, #0x48]\n"
"add x22, x22, x12\n"
- "st1 { v24.s }[0], [x19]\n"
- "ldr x21, [x17, #0x48]\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x16, #0x50]\n"
"add x21, x21, x12\n"
- "st1 { v23.s }[0], [x22]\n"
- "ldr x20, [x17, #0x50]\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x58]\n"
"add x20, x20, x12\n"
- "st1 { v22.s }[0], [x21]\n"
- "ldr x19, [x17, #0x58]\n"
- "add x19, x19, x12\n"
- "st1 { v21.s }[0], [x20]\n"
- "ldr x22, [x17, #0x60]\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "add x23, x23, x12\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x16, #0x68]\n"
"add x22, x22, x12\n"
- "st1 { v20.s }[0], [x19]\n"
- "ldr x21, [x17, #0x68]\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x16, #0x70]\n"
"add x21, x21, x12\n"
- "st1 { v19.s }[0], [x22]\n"
- "ldr x20, [x17, #0x70]\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x16, #0x78]\n"
"add x20, x20, x12\n"
- "st1 { v18.s }[0], [x21]\n"
- "ldr x19, [x17, #0x78]\n"
- "add x19, x19, x12\n"
- "st1 { v17.s }[0], [x20]\n"
- "st1 { v16.s }[0], [x19]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"71:" // Oddments: Store: Bit 1: End
-
"72:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 65e487ddbb..e42ceffb50 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,526 +87,526 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x6, #0x0\n"
+ "mov x23, #0x0\n"
"mov x27, #0x0\n"
"1:" // Tile loop
- "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x26, #0x4\n"
- "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"mov x25, #0x2\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x24, %x[params_struct], %[offsetof_args_min]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "add x21, %x[params_struct], %[offsetof_args_max]\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mov x22, #0x0\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
- "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
- "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x17, x17, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1r { v19.4s }, [x24]\n"
- "add x14, x17, x23, LSL #2\n"
- "ld1r { v18.4s }, [x21]\n"
- "add x13, x14, x23, LSL #2\n"
- "lsl x8, x8, #0x2\n"
- "add x12, x13, x23, LSL #2\n"
- "add x11, x12, x23, LSL #2\n"
- "add x10, x8, x8\n"
- "add x9, x10, x8\n"
- "add x28, x9, x8\n"
- "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
- "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x25\n" // offset *= output_tile_size
- "add x15, x15, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x27, x15, x20, LSL #2\n"
- "lsl x16, x16, #0x2\n"
- "mov x21, #0x10\n" // cntb _, ALL, #1
- "sub x20, XZR, x21\n"
- "lsr x19, %x[n_channels], #0x2\n"
- "cbz x19, 4f\n"
- "ldr q17, [x7, #0x0]\n"
- "ldr q0, [x7, #0x10]\n"
- "cmp x21, x19, LSL #4\n"
- "ldr q1, [x7, #0x20]\n"
- "ldr q2, [x7, #0x30]\n"
- "ldr q3, [x7, #0x40]\n"
- "ldr q4, [x7, #0x50]\n"
- "ldr q5, [x7, #0x60]\n"
- "ldr q6, [x7, #0x70]\n"
- "ldr q7, [x7, #0x80]\n"
- "ldr q8, [x7, #0x90]\n"
- "add x7, x7, #0xa0\n"
- "ldr q9, [x13, x10]\n"
- "ld1 { v10.4s }, [x17]\n"
- "ldr q11, [x17, x8]\n"
- "ldr q12, [x17, x9]\n"
- "ldr q13, [x17, x28]\n"
- "ld1 { v14.4s }, [x14]\n"
- "ldr q15, [x14, x8]\n"
- "ldr q16, [x17, x10]\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x2\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x16, x8, x24, LSL #2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x10, x12, x24, LSL #2\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #2\n"
+ "lsl x7, x7, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x16, x9]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x11]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x16, x13]\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "ld1 { v14.4s }, [x12]\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "ld1 { v15.4s }, [x14]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x12, x6]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x14, x6]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q17, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x12, x11]\n"
"add x20, x20, #0x10\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
- "add x22, x22, #0x10\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "add x17, x17, #0x10\n"
- "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "ldr q17, [x7, #0x0]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "ldr q14, [x12, x9]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x10, x6]\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
"add x21, x21, #0x10\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x17]\n"
- "cmp x21, x19, LSL #4\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x14, x9]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "ldr q13, [x14, x10]\n"
- "add x14, x14, #0x10\n"
- "fmla v31.4s, v3.4s, v14.4s\n"
- "ld1 { v14.4s }, [x12]\n"
- "fmla v30.4s, v0.4s, v16.4s\n"
- "fmla v31.4s, v4.4s, v15.4s\n"
- "ld1 { v15.4s }, [x13]\n"
- "fmla v29.4s, v3.4s, v14.4s\n"
- "ldr q14, [x12, x28]\n"
"fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x12, x8]\n"
- "fmla v31.4s, v2.4s, v16.4s\n"
- "ldr q16, [x13, x8]\n"
- "fmla v29.4s, v0.4s, v15.4s\n"
- "ldr q0, [x7, #0x10]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x13, x9]\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x13, x28]\n"
- "add x13, x13, #0x10\n"
- "fmla v30.4s, v3.4s, v13.4s\n"
- "ldr q13, [x12, x9]\n"
- "ldr q9, [x13, x10]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "ld1 { v15.4s }, [x11]\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x11, x8]\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "ldr q4, [x7, #0x50]\n"
- "fmla v31.4s, v7.4s, v16.4s\n"
- "ldr q16, [x12, x10]\n"
- "add x12, x12, #0x10\n"
- "fmla v29.4s, v6.4s, v15.4s\n"
- "ldr q15, [x11, x10]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "ldr q12, [x17, x9]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "ldr q1, [x7, #0x20]\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr q13, [x17, x28]\n"
- "fmla v28.4s, v5.4s, v14.4s\n"
- "ldr q14, [x11, x9]\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "st1 { v31.4s }, [x15]\n"
- "fmla v28.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v16.4s\n"
- "ldr q11, [x11, x28]\n"
- "add x11, x11, #0x10\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "ldr q2, [x7, #0x30]\n"
- "ldr q5, [x7, #0x60]\n"
- "fmla v28.4s, v3.4s, v16.4s\n"
- "ldr q16, [x17, x10]\n"
- "fmla v29.4s, v8.4s, v15.4s\n"
- "str q30, [x15, x16]\n"
- "add x15, x15, #0x10\n"
- "fmla v28.4s, v7.4s, v14.4s\n"
- "ld1 { v14.4s }, [x14]\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "ldr q3, [x7, #0x40]\n"
- "ldr q7, [x7, #0x80]\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "st1 { v29.4s }, [x27]\n"
+ "ldr q11, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "ldr q14, [x10, x11]\n"
"fmla v28.4s, v6.4s, v15.4s\n"
- "ldr q15, [x14, x8]\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x17, x8]\n"
- "ldr q6, [x7, #0x70]\n"
+ "ld1 { v15.4s }, [x10]\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x10, x13]\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
"fmax v28.4s, v28.4s, v19.4s\n"
- "ldr q8, [x7, #0x90]\n"
- "add x7, x7, #0xa0\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
"fmin v28.4s, v28.4s, v18.4s\n"
- "str q28, [x27, x16]\n"
- "add x27, x27, #0x10\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v28.4s }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q29, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v30.4s }, [x28]\n"
+ "str q31, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "add x17, x17, #0x10\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x14, x28]\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x14, x9]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "ldr q13, [x14, x10]\n"
- "add x14, x14, #0x10\n"
- "fmla v31.4s, v3.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x16, x9]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x11]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x16, x13]\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
"ld1 { v14.4s }, [x12]\n"
- "fmla v30.4s, v0.4s, v16.4s\n"
- "fmla v31.4s, v4.4s, v15.4s\n"
- "ld1 { v15.4s }, [x13]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x12, x8]\n"
- "fmla v29.4s, v3.4s, v14.4s\n"
- "ldr q14, [x12, x28]\n"
- "fmla v31.4s, v2.4s, v16.4s\n"
- "ldr q16, [x13, x8]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x13, x9]\n"
- "fmla v29.4s, v0.4s, v15.4s\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "fmla v30.4s, v3.4s, v13.4s\n"
- "ldr q13, [x12, x9]\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "ld1 { v15.4s }, [x14]\n"
"fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x13, x28]\n"
- "add x13, x13, #0x10\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "ld1 { v15.4s }, [x11]\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x11, x8]\n"
- "fmla v31.4s, v7.4s, v16.4s\n"
- "ldr q16, [x12, x10]\n"
- "add x12, x12, #0x10\n"
- "fmla v29.4s, v6.4s, v15.4s\n"
- "ldr q15, [x11, x10]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmla v28.4s, v5.4s, v14.4s\n"
- "ldr q14, [x11, x9]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "st1 { v31.4s }, [x15]\n"
- "fmla v28.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v16.4s\n"
- "ldr q11, [x11, x28]\n"
- "add x11, x11, #0x10\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "str q30, [x15, x16]\n"
- "fmla v28.4s, v3.4s, v16.4s\n"
- "add x15, x15, #0x10\n"
- "fmla v29.4s, v8.4s, v15.4s\n"
- "fmla v28.4s, v7.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
+ "ldr q11, [x12, x6]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x14, x6]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x12, x11]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "ldr q14, [x12, x9]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x10, x6]\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "ldr q14, [x10, x11]\n"
"fmla v28.4s, v6.4s, v15.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "st1 { v29.4s }, [x27]\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
+ "ld1 { v15.4s }, [x10]\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x12, x13]\n"
"fmax v28.4s, v28.4s, v19.4s\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x10, x13]\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
"fmin v28.4s, v28.4s, v18.4s\n"
- "str q28, [x27, x16]\n"
- "add x27, x27, #0x10\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "st1 { v28.4s }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x10, x9]\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "str q29, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "st1 { v30.4s }, [x28]\n"
+ "str q31, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 43f\n"
- "ldr q17, [x7, #0x0]\n"
- "ldr q0, [x7, #0x10]\n"
- "add x26, x13, x10\n"
- "ldr q1, [x7, #0x20]\n"
- "add x25, x17, XZR\n"
- "ldr q2, [x7, #0x30]\n"
- "add x24, x17, x8\n"
- "ldr q3, [x7, #0x40]\n"
- "add x23, x17, x9\n"
- "ldr q4, [x7, #0x50]\n"
- "add x22, x17, x28\n"
- "ldr q5, [x7, #0x60]\n"
- "add x21, x14, XZR\n"
- "ldr q6, [x7, #0x70]\n"
- "add x20, x14, x8\n"
- "ldr q7, [x7, #0x80]\n"
- "add x19, x17, x10\n"
- "ldr q8, [x7, #0x90]\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
"tbz %x[n_channels], #1, 5f\n"
- "ldr d9, [x26], #0x8\n"
- "ldr d10, [x25], #0x8\n"
- "ldr d11, [x24], #0x8\n"
- "ldr d12, [x23], #0x8\n"
- "ldr d13, [x22], #0x8\n"
- "ldr d14, [x21], #0x8\n"
- "ldr d15, [x20], #0x8\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v9.s }[2], [x26]\n"
- "ld1 { v10.s }[2], [x25]\n"
- "ld1 { v11.s }[2], [x24]\n"
- "ld1 { v12.s }[2], [x23]\n"
- "ld1 { v13.s }[2], [x22]\n"
- "ld1 { v14.s }[2], [x21]\n"
- "ld1 { v15.s }[2], [x20]\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v11.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x21]\n"
+ "ld1 { v16.s }[2], [x20]\n"
"b 6f\n"
"5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
- "ldr s9, [x26, #0x0]\n"
- "ldr s10, [x25, #0x0]\n"
- "ldr s11, [x24, #0x0]\n"
- "ldr s12, [x23, #0x0]\n"
- "ldr s13, [x22, #0x0]\n"
- "ldr s14, [x21, #0x0]\n"
- "ldr s15, [x20, #0x0]\n"
- "ldr s16, [x19, #0x0]\n"
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s11, [x25, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s15, [x21, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
- "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "add x19, x14, x9\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v3.4s, v14.4s\n"
- "fmla v30.4s, v0.4s, v16.4s\n"
- "fmla v31.4s, v4.4s, v15.4s\n"
- "fmla v31.4s, v2.4s, v16.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 7f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
- "fmla v30.4s, v4.4s, v11.4s\n"
- "add x19, x14, x28\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x16, x9\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 10f\n"
"9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
- "fmla v30.4s, v5.4s, v12.4s\n"
- "add x19, x14, x10\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x16, x13\n"
"tbz %x[n_channels], #1, 11f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
- "fmla v31.4s, v5.4s, v13.4s\n"
- "add x19, x12, XZR\n"
- "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v14.s }[2], [x19]\n"
+ "ld1 { v14.s }[2], [x20]\n"
"b 14f\n"
"13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
- "ldr s14, [x19, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
- "fmla v29.4s, v3.4s, v14.4s\n"
- "add x19, x13, XZR\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x14, XZR\n"
"tbz %x[n_channels], #1, 15f\n"
- "ldr d15, [x19], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
- "ldr s15, [x19, #0x0]\n"
+ "ldr s15, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v15.4s\n"
- "add x19, x12, x8\n"
- "fmla v29.4s, v0.4s, v15.4s\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x12, x6\n"
"tbz %x[n_channels], #1, 17f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 18f\n"
"17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
- "fmla v29.4s, v4.4s, v11.4s\n"
- "add x19, x13, x8\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x14, x6\n"
"tbz %x[n_channels], #1, 19f\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v16.s }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
- "ldr s16, [x19, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
- "fmla v31.4s, v7.4s, v16.4s\n"
- "add x19, x12, x9\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x12, x11\n"
"tbz %x[n_channels], #1, 21f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 22f\n"
"21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
- "fmla v28.4s, v4.4s, v13.4s\n"
- "add x19, x13, x9\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x14, x11\n"
"tbz %x[n_channels], #1, 23f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
- "fmla v30.4s, v7.4s, v12.4s\n"
- "add x19, x12, x28\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
"tbz %x[n_channels], #1, 25f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
- "ld1 { v14.s }[2], [x19]\n"
+ "ld1 { v14.s }[2], [x20]\n"
"b 26f\n"
"25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
- "ldr s14, [x19, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
- "fmla v28.4s, v5.4s, v14.4s\n"
- "add x19, x11, XZR\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x10, XZR\n"
"tbz %x[n_channels], #1, 27f\n"
- "ldr d15, [x19], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
- "ldr s15, [x19, #0x0]\n"
+ "ldr s15, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
- "fmla v29.4s, v6.4s, v15.4s\n"
- "add x19, x13, x28\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x14, x9\n"
"tbz %x[n_channels], #1, 29f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 30f\n"
"29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
- "fmla v30.4s, v8.4s, v11.4s\n"
- "add x19, x11, x8\n"
- "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x10, x6\n"
"tbz %x[n_channels], #1, 31f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
- "fmla v29.4s, v7.4s, v13.4s\n"
- "add x19, x12, x10\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x12, x13\n"
"tbz %x[n_channels], #1, 33f\n"
- "ldr d16, [x19], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v16.s }[2], [x20]\n"
"b 34f\n"
"33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
- "ldr s16, [x19, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
"34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
- "fmla v29.4s, v5.4s, v16.4s\n"
- "add x19, x11, x9\n"
- "fmla v28.4s, v3.4s, v16.4s\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x10, x11\n"
"tbz %x[n_channels], #1, 35f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v14.s }[2], [x19]\n"
+ "ld1 { v14.s }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
- "ldr s14, [x19, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
- "fmla v28.4s, v7.4s, v14.4s\n"
- "add x19, x11, x10\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x10, x13\n"
"tbz %x[n_channels], #1, 37f\n"
- "ldr d15, [x19], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "ld1 { v15.s }[2], [x20]\n"
"b 38f\n"
"37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
- "ldr s15, [x19, #0x0]\n"
+ "ldr s15, [x20, #0x0]\n"
"38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
- "fmla v29.4s, v8.4s, v15.4s\n"
- "add x19, x11, x28\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x10, x9\n"
"tbz %x[n_channels], #1, 39f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
- "fmla v28.4s, v8.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
"fmax v28.4s, v28.4s, v19.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
"fmin v28.4s, v28.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
"tbz %x[n_channels], #1, 41f\n"
- "mov x19, x15\n"
- "st1 { v31.d }[0], [x19], x16\n"
- "add x15, x15, #0x8\n"
- "st1 { v30.d }[0], [x19]\n"
- "mov x19, x27\n"
- "st1 { v29.d }[0], [x19], x16\n"
- "add x27, x27, #0x8\n"
- "st1 { v28.d }[0], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 42f\n"
- "mov x20, x15\n"
- "st1 { v31.s }[2], [x20], x16\n"
- "mov x19, x27\n"
- "st1 { v30.s }[2], [x20]\n"
- "st1 { v29.s }[2], [x19], x16\n"
- "st1 { v28.s }[2], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 42f\n"
"41:" // Tile loop: Oddments: Store: Bit 1: Unset
- "mov x20, x15\n"
- "st1 { v31.s }[0], [x20], x16\n"
- "mov x19, x27\n"
- "st1 { v30.s }[0], [x20]\n"
- "st1 { v29.s }[0], [x19], x16\n"
- "st1 { v28.s }[0], [x19]\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"42:" // Tile loop: Oddments: Store: Bit 1: End
"43:" // Tile loop: End
- "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x6, #0x1\n"
"ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"add x27, x27, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x27, x19\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
"csel x27, x27, XZR, LT\n"
- "csel x6, x6, x21, LT\n"
- "cmp x6, x20\n"
+ "cmp x23, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 4b24862eb2..f65633002e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,385 +88,385 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "lsr x25, %x[n_channels], #0x2\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v19.4s }, [x20]\n"
- "ld1r { v18.4s }, [x19]\n"
- "mov x14, #0x0\n"
- "ldp x13, x12, [x21, #0x0]\n"
- "mov x11, #0x10\n" // cntb _, ALL, #1
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
- "sub x28, XZR, x11\n"
- "lsr x27, %x[n_channels], #0x2\n"
- "cbz x27, 3f\n"
- "ldr q17, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x11, x27, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldp x26, x25, [x16, #0x0]\n"
- "ldp x24, x23, [x16, #0x10]\n"
- "ldp x22, x21, [x16, #0x20]\n"
- "ldr q9, [x26, x14]\n"
- "ldr q10, [x25, x14]\n"
- "ldr q11, [x24, x14]\n"
- "ldr q12, [x23, x14]\n"
- "ldr q13, [x22, x14]\n"
- "ldr q14, [x21, x14]\n"
- "ldp x20, x19, [x16, #0x30]\n"
- "ldr q15, [x20, x14]\n"
- "ldr q16, [x19, x14]\n"
+ "mov x28, #0x0\n"
+ "sub x23, XZR, x26\n"
+ "cbz x25, 3f\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q0, [x24, #0x10]\n"
+ "cmp x26, x25, LSL #4\n"
+ "ldr q1, [x24, #0x20]\n"
+ "ldr q2, [x24, #0x30]\n"
+ "ldr q3, [x24, #0x40]\n"
+ "ldr q4, [x24, #0x50]\n"
+ "ldr q5, [x24, #0x60]\n"
+ "ldr q6, [x24, #0x70]\n"
+ "ldr q7, [x24, #0x80]\n"
+ "ldr q8, [x24, #0x90]\n"
+ "add x24, x24, #0xa0\n"
+ "ldp x22, x20, [x13, #0x0]\n"
+ "ldr q9, [x22, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldr q13, [x22, x28]\n"
+ "ldr q14, [x21, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x26, [x16, #0x40]\n"
- "add x28, x28, #0x10\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
- "ldr x25, [x16, #0x48]\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x24, [x16, #0x50]\n"
- "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "ldr x23, [x16, #0x58]\n"
- "ldr x22, [x16, #0x60]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x25, x14]\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x26, x14]\n"
- "ldr x20, [x16, #0x70]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "ldr q13, [x24, x14]\n"
- "fmla v31.4s, v3.4s, v14.4s\n"
- "ldr q14, [x23, x14]\n"
- "ldr x19, [x16, #0x78]\n"
- "fmla v30.4s, v0.4s, v16.4s\n"
- "ldr x26, [x16, #0x80]\n"
- "fmla v31.4s, v4.4s, v15.4s\n"
- "ldr q15, [x22, x14]\n"
- "fmla v29.4s, v3.4s, v14.4s\n"
- "ldr x25, [x16, #0x88]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x21, x14]\n"
- "ldr x24, [x16, #0x90]\n"
- "fmla v31.4s, v2.4s, v16.4s\n"
- "ldr q16, [x20, x14]\n"
- "fmla v29.4s, v0.4s, v15.4s\n"
- "ldr q14, [x25, x14]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x26, x14]\n"
- "ldr x23, [x16, #0x98]\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "ldr x22, [x16, #0xa0]\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x22, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x22, x28]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x21, x28]\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q14, [x20, x28]\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
"fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x23, x14]\n"
- "fmla v30.4s, v3.4s, v13.4s\n"
- "ldr q13, [x19, x14]\n"
- "ldr x21, [x16, #0xa8]\n"
- "fmla v31.4s, v6.4s, v15.4s\n"
- "ldr q15, [x24, x14]\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
- "ldr x20, [x16, #0xb0]\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "ldr x19, [x16, #0xb8]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x22, x14]\n"
- "ldr x26, [x16, #0xc0]\n"
- "fmla v31.4s, v7.4s, v16.4s\n"
- "fmla v29.4s, v6.4s, v15.4s\n"
- "ldr q16, [x21, x14]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "ldr q15, [x19, x14]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "ldr q17, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "ldr q1, [x15, #0x20]\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v28.4s, v5.4s, v14.4s\n"
- "ldr q14, [x20, x14]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "str q31, [x13, x28]\n"
- "fmla v28.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v16.4s\n"
- "ldr q11, [x26, x14]\n"
- "add x14, x14, #0x10\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "ldp x26, x25, [x16, #0x0]\n"
- "ldp x24, x23, [x16, #0x10]\n"
- "fmla v28.4s, v3.4s, v16.4s\n"
- "ldp x22, x21, [x16, #0x20]\n"
- "fmla v29.4s, v8.4s, v15.4s\n"
- "ldr q9, [x26, x11]\n"
- "ldr q10, [x25, x11]\n"
- "fmla v28.4s, v7.4s, v14.4s\n"
- "ldr q12, [x23, x11]\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "ldr q13, [x22, x11]\n"
- "ldr q14, [x21, x11]\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "ldp x20, x19, [x16, #0x30]\n"
- "str q30, [x12, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "ldr x22, [x13, #0x60]\n"
+ "ldr q15, [x22, x28]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr x22, [x13, #0x80]\n"
+ "ldr q12, [x22, x28]\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q17, [x24, #0x0]\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x20, x28]\n"
+ "ldr x21, [x13, #0x68]\n"
+ "ldr q11, [x21, x28]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q14, [x20, x28]\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "ldr q0, [x24, #0x10]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr x21, [x13, #0x70]\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "ldr q4, [x24, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
"fmla v28.4s, v6.4s, v15.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
- "ldr q11, [x24, x11]\n"
- "ldr q15, [x20, x11]\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "ldr q11, [x20, x28]\n"
+ "ldr q1, [x24, #0x20]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "ldr q2, [x24, #0x30]\n"
+ "ldr x21, [x13, #0x90]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr x21, [x13, #0xa8]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
"fmax v28.4s, v28.4s, v19.4s\n"
- "ldr q16, [x19, x11]\n"
- "add x11, x11, #0x10\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x22, [x13, #0xa0]\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "ldr q13, [x22, x28]\n"
+ "ldr q3, [x24, #0x40]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "ldr q5, [x24, #0x60]\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "add x23, x23, #0x10\n"
"fmin v28.4s, v28.4s, v18.4s\n"
- "str q29, [x10, x28]\n"
- "cmp x11, x27, LSL #4\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "str q28, [x9, x28]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "blt 1b\n"
- "2:" // Channel tail
- "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x26, [x16, #0x40]\n"
- "add x28, x28, #0x10\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
- "ldr x25, [x16, #0x48]\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "ldr x24, [x16, #0x50]\n"
- "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "ldr x23, [x16, #0x58]\n"
- "ldr x22, [x16, #0x60]\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "ldr q12, [x25, x14]\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "ldr q11, [x26, x14]\n"
- "ldr x20, [x16, #0x70]\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "ldr q13, [x24, x14]\n"
- "fmla v31.4s, v3.4s, v14.4s\n"
- "ldr q14, [x23, x14]\n"
- "ldr x19, [x16, #0x78]\n"
- "fmla v30.4s, v0.4s, v16.4s\n"
- "ldr x26, [x16, #0x80]\n"
- "fmla v31.4s, v4.4s, v15.4s\n"
- "ldr q15, [x22, x14]\n"
- "fmla v29.4s, v3.4s, v14.4s\n"
- "ldr x25, [x16, #0x88]\n"
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr q11, [x21, x14]\n"
- "ldr x24, [x16, #0x90]\n"
- "fmla v31.4s, v2.4s, v16.4s\n"
- "ldr q16, [x20, x14]\n"
- "fmla v29.4s, v0.4s, v15.4s\n"
- "ldr q14, [x25, x14]\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q12, [x26, x14]\n"
- "ldr x23, [x16, #0x98]\n"
- "fmla v31.4s, v5.4s, v13.4s\n"
- "ldr x22, [x16, #0xa0]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr q11, [x23, x14]\n"
- "fmla v30.4s, v3.4s, v13.4s\n"
- "ldr q13, [x19, x14]\n"
- "ldr x21, [x16, #0xa8]\n"
+ "ldr q14, [x21, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "ldr q15, [x20, x28]\n"
+ "ldr q7, [x24, #0x80]\n"
"fmla v31.4s, v6.4s, v15.4s\n"
- "ldr q15, [x24, x14]\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
- "ldr x20, [x16, #0xb0]\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "ldr x19, [x16, #0xb8]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr q13, [x22, x14]\n"
- "ldr x26, [x16, #0xc0]\n"
- "fmla v31.4s, v7.4s, v16.4s\n"
- "fmla v29.4s, v6.4s, v15.4s\n"
- "ldr q16, [x21, x14]\n"
- "fmla v30.4s, v8.4s, v11.4s\n"
- "ldr q15, [x19, x14]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "fmla v29.4s, v7.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "ldr q6, [x24, #0x70]\n"
+ "ldr x22, [x13, #0xc0]\n"
"fmax v30.4s, v30.4s, v19.4s\n"
- "fmla v28.4s, v5.4s, v14.4s\n"
- "ldr q14, [x20, x14]\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "str q31, [x13, x28]\n"
- "fmla v28.4s, v2.4s, v11.4s\n"
- "fmla v29.4s, v5.4s, v16.4s\n"
- "ldr q11, [x26, x14]\n"
- "add x14, x14, #0x10\n"
"fmin v30.4s, v30.4s, v18.4s\n"
- "str q30, [x12, x28]\n"
- "fmla v28.4s, v3.4s, v16.4s\n"
- "fmla v29.4s, v8.4s, v15.4s\n"
- "fmla v28.4s, v7.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
- "str q29, [x10, x28]\n"
+ "ldr q11, [x22, x28]\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "ldr q8, [x24, #0x90]\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "ldp x22, x20, [x13, #0x0]\n"
+ "ldr q9, [x22, x26]\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x26]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q28, [x12, x23]\n"
+ "add x24, x24, #0xa0\n"
+ "ldr q11, [x21, x26]\n"
+ "ldr q12, [x20, x26]\n"
+ "str q29, [x11, x23]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldr q13, [x22, x26]\n"
+ "str q30, [x10, x23]\n"
+ "ldr q14, [x21, x26]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q31, [x9, x23]\n"
+ "ldr q15, [x21, x26]\n"
+ "ldr q16, [x20, x26]\n"
+ "add x26, x26, #0x10\n"
+ "cmp x26, x25, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x22, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x22, x28]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x21, x28]\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q14, [x20, x28]\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr x20, [x13, #0x78]\n"
+ "ldr x22, [x13, #0x60]\n"
+ "ldr q15, [x22, x28]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "ldr x22, [x13, #0x80]\n"
+ "ldr q12, [x22, x28]\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr x21, [x13, #0x68]\n"
+ "ldr q11, [x21, x28]\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x20, x28]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q14, [x20, x28]\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr x21, [x13, #0x70]\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x20, x28]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
"fmla v28.4s, v6.4s, v15.4s\n"
- "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr x21, [x13, #0x90]\n"
+ "ldr q15, [x21, x28]\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "ldr x21, [x13, #0xa8]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x22, [x13, #0xa0]\n"
+ "ldr q13, [x22, x28]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "ldr q14, [x21, x28]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q15, [x20, x28]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "ldr x22, [x13, #0xc0]\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x22, x28]\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
"fmax v28.4s, v28.4s, v19.4s\n"
+ "add x23, x23, #0x10\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "add x28, x28, #0x10\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
"fmin v28.4s, v28.4s, v18.4s\n"
- "str q28, [x9, x28]\n"
+ "str q28, [x12, x23]\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "str q29, [x11, x23]\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "str q30, [x10, x23]\n"
+ "str q31, [x9, x23]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 42f\n"
- "ldr q17, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "mov x28, x14\n"
- "ldr q1, [x15, #0x20]\n"
- "add x13, x13, x28\n"
- "ldr q2, [x15, #0x30]\n"
- "add x12, x12, x28\n"
- "ldr q3, [x15, #0x40]\n"
- "add x10, x10, x28\n"
- "ldr q4, [x15, #0x50]\n"
- "add x9, x9, x28\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "ldr x26, [x16, #0x0]\n"
- "ldr x25, [x16, #0x8]\n"
- "ldr x24, [x16, #0x10]\n"
- "add x26, x26, x14\n"
- "ldr x23, [x16, #0x18]\n"
- "add x25, x25, x14\n"
- "ldr x22, [x16, #0x20]\n"
- "add x24, x24, x14\n"
- "ldr x21, [x16, #0x28]\n"
- "add x23, x23, x14\n"
- "ldr x20, [x16, #0x30]\n"
- "add x22, x22, x14\n"
- "ldr x19, [x16, #0x38]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
- "add x19, x19, x14\n"
+ "ldr q17, [x24, #0x0]\n"
+ "ldr q0, [x24, #0x10]\n"
+ "mov x23, x28\n"
+ "add x12, x12, x23\n"
+ "ldr q1, [x24, #0x20]\n"
+ "ldr q2, [x24, #0x30]\n"
+ "add x11, x11, x23\n"
+ "add x10, x10, x23\n"
+ "ldr q3, [x24, #0x40]\n"
+ "ldr q4, [x24, #0x50]\n"
+ "add x9, x9, x23\n"
+ "ldr q5, [x24, #0x60]\n"
+ "ldr q6, [x24, #0x70]\n"
+ "ldr q7, [x24, #0x80]\n"
+ "ldr q8, [x24, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v9.d }[0], [x26], #0x8\n"
- "ld1 { v10.d }[0], [x25], #0x8\n"
- "ld1 { v11.d }[0], [x24], #0x8\n"
- "ld1 { v12.d }[0], [x23], #0x8\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
- "ld1 { v14.d }[0], [x21], #0x8\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 5f\n"
- "ld1 { v9.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x25], #0x4\n"
- "ld1 { v11.s }[2], [x24], #0x4\n"
- "ld1 { v12.s }[2], [x23], #0x4\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
- "ld1 { v14.s }[2], [x21], #0x4\n"
- "ld1 { v15.s }[2], [x20], #0x4\n"
- "ld1 { v16.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"b 5f\n"
"4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
- "ld1 { v9.s }[0], [x26], #0x4\n"
- "ld1 { v10.s }[0], [x25], #0x4\n"
- "ld1 { v11.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v13.s }[0], [x22], #0x4\n"
- "ld1 { v14.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "ld1 { v16.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
- "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
- "ldr x26, [x16, #0x40]\n"
- "add x26, x26, x14\n"
- "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
- "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
- "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v10.4s\n"
- "fmla v30.4s, v1.4s, v12.4s\n"
- "fmla v31.4s, v1.4s, v11.4s\n"
- "fmla v30.4s, v2.4s, v13.4s\n"
- "fmla v31.4s, v3.4s, v14.4s\n"
- "fmla v30.4s, v0.4s, v16.4s\n"
- "fmla v31.4s, v4.4s, v15.4s\n"
- "fmla v31.4s, v2.4s, v16.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "mov v31.16b, v17.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 7f\n"
"6:" // Oddments: Load input (1, 3): Bit 1: Unset
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (1, 3): Bit 1: End
- "fmla v30.4s, v4.4s, v11.4s\n"
- "ldr x25, [x16, #0x48]\n"
- "add x25, x25, x14\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v12.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 9f\n"
"8:" // Oddments: Load input (1, 4): Bit 1: Unset
- "ld1 { v12.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (1, 4): Bit 1: End
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr x24, [x16, #0x50]\n"
- "add x24, x24, x14\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Load input (1, 2): Bit 1: Unset
- "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (1, 2): Bit 1: End
- "fmla v31.4s, v5.4s, v13.4s\n"
- "ldr x23, [x16, #0x58]\n"
- "fmla v30.4s, v3.4s, v13.4s\n"
- "add x23, x23, x14\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v14.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v14.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"b 13f\n"
"12:" // Oddments: Load input (3, 0): Bit 1: Unset
- "ld1 { v14.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (3, 0): Bit 1: End
- "fmla v29.4s, v3.4s, v14.4s\n"
- "ldr x22, [x16, #0x60]\n"
- "add x22, x22, x14\n"
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v15.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v15.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"b 15f\n"
"14:" // Oddments: Load input (2, 0): Bit 1: Unset
- "ld1 { v15.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 0): Bit 1: End
- "fmla v31.4s, v6.4s, v15.4s\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla v29.4s, v0.4s, v15.4s\n"
- "add x21, x21, x14\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: Load input (3, 1): Bit 1: Unset
- "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (3, 1): Bit 1: End
- "fmla v29.4s, v4.4s, v11.4s\n"
- "ldr x20, [x16, #0x70]\n"
- "add x20, x20, x14\n"
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
@@ -475,90 +475,90 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (2, 1): Bit 1: Unset
"ld1 { v16.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (2, 1): Bit 1: End
- "fmla v31.4s, v7.4s, v16.4s\n"
- "ldr x19, [x16, #0x78]\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
- "add x19, x19, x14\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 21f\n"
"20:" // Oddments: Load input (3, 3): Bit 1: Unset
- "ld1 { v13.s }[0], [x19], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (3, 3): Bit 1: End
- "fmla v28.4s, v4.4s, v13.4s\n"
- "ldr x26, [x16, #0x80]\n"
- "add x26, x26, x14\n"
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v12.d }[0], [x26], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v12.s }[2], [x26], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 23f\n"
"22:" // Oddments: Load input (2, 3): Bit 1: Unset
- "ld1 { v12.s }[0], [x26], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (2, 3): Bit 1: End
- "fmla v30.4s, v7.4s, v12.4s\n"
- "ldr x25, [x16, #0x88]\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "add x25, x25, x14\n"
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v14.d }[0], [x25], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"b 25f\n"
"24:" // Oddments: Load input (3, 4): Bit 1: Unset
- "ld1 { v14.s }[0], [x25], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 4): Bit 1: End
- "fmla v28.4s, v5.4s, v14.4s\n"
- "ldr x24, [x16, #0x90]\n"
- "add x24, x24, x14\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v15.d }[0], [x24], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"b 27f\n"
"26:" // Oddments: Load input (4, 0): Bit 1: Unset
- "ld1 { v15.s }[0], [x24], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 0): Bit 1: End
- "fmla v29.4s, v6.4s, v15.4s\n"
- "ldr x23, [x16, #0x98]\n"
- "add x23, x23, x14\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
- "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 29f\n"
"28:" // Oddments: Load input (2, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (2, 4): Bit 1: End
- "fmla v30.4s, v8.4s, v11.4s\n"
- "ldr x22, [x16, #0xa0]\n"
- "fmla v28.4s, v2.4s, v11.4s\n"
- "add x22, x22, x14\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 31f\n"
"30:" // Oddments: Load input (4, 1): Bit 1: Unset
- "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (4, 1): Bit 1: End
- "fmla v29.4s, v7.4s, v13.4s\n"
- "ldr x21, [x16, #0xa8]\n"
- "add x21, x21, x14\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v16.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v16.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
"b 33f\n"
"32:" // Oddments: Load input (3, 2): Bit 1: Unset
- "ld1 { v16.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (3, 2): Bit 1: End
- "fmla v29.4s, v5.4s, v16.4s\n"
- "ldr x20, [x16, #0xb0]\n"
- "fmla v28.4s, v3.4s, v16.4s\n"
- "add x20, x20, x14\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 34f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
@@ -567,61 +567,59 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (4, 3): Bit 1: Unset
"ld1 { v14.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (4, 3): Bit 1: End
- "fmla v28.4s, v7.4s, v14.4s\n"
- "ldr x19, [x16, #0xb8]\n"
- "add x19, x19, x14\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
- "ld1 { v15.s }[2], [x19], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"b 37f\n"
"36:" // Oddments: Load input (4, 2): Bit 1: Unset
- "ld1 { v15.s }[0], [x19], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (4, 2): Bit 1: End
- "fmla v29.4s, v8.4s, v15.4s\n"
- "ldr x26, [x16, #0xc0]\n"
- "fmla v28.4s, v6.4s, v15.4s\n"
- "add x26, x26, x14\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v11.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v11.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 39f\n"
"38:" // Oddments: Load input (4, 4): Bit 1: Unset
- "ld1 { v11.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (4, 4): Bit 1: End
- "fmla v28.4s, v8.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v19.4s\n"
- "fmax v30.4s, v30.4s, v19.4s\n"
- "fmax v29.4s, v29.4s, v19.4s\n"
- "fmin v31.4s, v31.4s, v18.4s\n"
- "fmin v30.4s, v30.4s, v18.4s\n"
- "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
"fmax v28.4s, v28.4s, v19.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
"fmin v28.4s, v28.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
"tbz %x[n_channels], #1, 40f\n"
- "st1 { v31.d }[0], [x13], #0x8\n"
- "st1 { v30.d }[0], [x12], #0x8\n"
- "st1 { v29.d }[0], [x10], #0x8\n"
- "st1 { v28.d }[0], [x9], #0x8\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
- "st1 { v31.s }[2], [x13], #0x4\n"
- "st1 { v30.s }[2], [x12], #0x4\n"
- "st1 { v29.s }[2], [x10], #0x4\n"
- "st1 { v28.s }[2], [x9], #0x4\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
"b 41f\n"
"40:" // Oddments: Store: Bit 1: Unset
- "st1 { v31.s }[0], [x13], #0x4\n"
- "st1 { v30.s }[0], [x12], #0x4\n"
- "st1 { v29.s }[0], [x10], #0x4\n"
- "st1 { v28.s }[0], [x9], #0x4\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
"41:" // Oddments: Store: Bit 1: End
-
"42:" // End
-
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 8b030ecc8b..6ca3976f02 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,403 +87,403 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x27, #0x0\n"
"mov x26, #0x0\n"
- "mov x25, #0x0\n"
"1:" // Tile loop
- "str x26, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x22, #0x2\n"
- "mov x21, #0x2\n"
- "str x25, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x3, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x20, x26, x24\n" // offset = tile_i * ld_input_row
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x20, x25, x3, x20\n" // offset += tile_j * ld_input_col
- "ldr x4, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x3, x3, #0x2\n"
- "mul x19, x26, x23\n" // offset = tile_i * ld_output_row
- "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x7, x3, x3\n"
- "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x8, x5, x24, LSL #2\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x19, x25, x4, x19\n" // offset += tile_j * ld_output_col
- "add x16, x8, x24, LSL #2\n"
- "mov x22, #0x10\n" // cntb _, ALL, #1
- "mul x19, x19, x21\n" // offset *= output_tile_size
- "lsr x21, %x[n_channels], #0x2\n"
- "add x15, x16, x24, LSL #2\n"
- "add x14, x7, x3\n"
- "add x13, x15, x24, LSL #2\n"
- "add x12, x14, x3\n"
- "add x6, x6, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x2\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x7, x4, x24, LSL #2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #2\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x16, x17, x24, LSL #2\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
"ld1r { v18.4s }, [x20]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x11, x13, x24, LSL #2\n"
- "add x10, x12, x3\n"
- "add x9, x6, x23, LSL #2\n"
- "lsl x4, x4, #0x2\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x22\n"
- "cbz x21, 4f\n"
- "ldr q16, [x17, #0x0]\n"
- "cmp x22, x21, LSL #4\n"
- "ldr q0, [x17, #0x10]\n"
- "ldr q1, [x17, #0x20]\n"
- "ldr q2, [x17, #0x30]\n"
- "ldr q3, [x17, #0x40]\n"
- "ldr q4, [x17, #0x50]\n"
- "ld1 { v5.4s }, [x5]\n"
- "add x17, x17, #0x60\n"
- "ldr q6, [x5, x3]\n"
- "ld1 { v7.4s }, [x8]\n"
- "ldr q8, [x8, x3]\n"
- "ldr q9, [x5, x7]\n"
- "ldr q13, [x8, x7]\n"
- "ldr q11, [x5, x14]\n"
- "ldr q12, [x5, x12]\n"
- "ldr q10, [x8, x10]\n"
- "ld1 { v14.4s }, [x16]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #2\n"
+ "lsl x3, x3, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q16, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.4s }, [x17]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x7, x15]\n"
"mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr q5, [x8, x14]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x17, #0x0]\n"
- "cmp x22, x21, LSL #4\n"
+ "ldr q0, [x8, #0x0]\n"
+ "ldr q16, [x8, #0x140]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x7, x13]\n"
"fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q6, [x8, x12]\n"
- "add x8, x8, #0x10\n"
+ "add x7, x7, #0x10\n"
"fmla v30.4s, v1.4s, v8.4s\n"
"fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x17, #0x10]\n"
- "add x19, x19, #0x10\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
"fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x4, x11]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q9, [x5, x10]\n"
- "add x5, x5, #0x10\n"
+ "add x4, x4, #0x10\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x17, #0x20]\n"
+ "ldr q2, [x8, #0x20]\n"
"add x20, x20, #0x10\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x17, x2]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q11, [x16, x3]\n"
- "ldr q16, [x17, #0x140]\n"
+ "add x21, x21, #0x10\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x17, #0x30]\n"
+ "ldr q3, [x8, #0x30]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x17, x6]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q12, [x16, x7]\n"
- "ldr q9, [x16, x14]\n"
+ "ldr q9, [x17, x15]\n"
"fmla v30.4s, v4.4s, v6.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x17, #0x40]\n"
+ "ldr q4, [x8, #0x40]\n"
"fmla v28.4s, v0.4s, v7.4s\n"
+ "ld1 { v7.4s }, [x7]\n"
"fmla v29.4s, v0.4s, v8.4s\n"
- "ld1 { v7.4s }, [x8]\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x17, #0x50]\n"
+ "ldr q0, [x8, #0x50]\n"
"fmla v28.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x17, x11]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr q8, [x16, x10]\n"
"fmla v30.4s, v1.4s, v11.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x17, #0x60]\n"
+ "ldr q1, [x8, #0x60]\n"
"fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x17, x13]\n"
"fmla v29.4s, v2.4s, v5.4s\n"
- "ldr q13, [x16, x12]\n"
- "add x16, x16, #0x10\n"
+ "add x17, x17, #0x10\n"
"fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x17, #0x70]\n"
+ "ldr q2, [x8, #0x70]\n"
"fmla v28.4s, v3.4s, v5.4s\n"
+ "ld1 { v5.4s }, [x16]\n"
"fmla v29.4s, v3.4s, v6.4s\n"
- "ld1 { v5.4s }, [x15]\n"
"fmla v30.4s, v3.4s, v9.4s\n"
"fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x17, #0x80]\n"
+ "ldr q3, [x8, #0x80]\n"
"fmla v28.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x16, x2]\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q6, [x15, x3]\n"
- "ldr q10, [x15, x7]\n"
+ "ldr q10, [x16, x6]\n"
"fmla v30.4s, v4.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x17, #0x90]\n"
+ "ldr q4, [x8, #0x90]\n"
"fmla v28.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x16, x11]\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q14, [x15, x10]\n"
"fmla v30.4s, v0.4s, v5.4s\n"
"fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x17, #0xa0]\n"
+ "ldr q0, [x8, #0xa0]\n"
"fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x15]\n"
"fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q11, [x15, x14]\n"
"fmla v30.4s, v1.4s, v6.4s\n"
"fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x17, #0xb0]\n"
+ "ldr q1, [x8, #0xb0]\n"
"fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x16, x13]\n"
"fmla v29.4s, v2.4s, v9.4s\n"
- "ldr q12, [x15, x12]\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v30.4s, v2.4s, v10.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x17, #0xc0]\n"
+ "ldr q2, [x8, #0xc0]\n"
"fmla v28.4s, v3.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x14]\n"
"fmla v29.4s, v3.4s, v13.4s\n"
- "ld1 { v9.4s }, [x13]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x17, #0xd0]\n"
+ "ldr q3, [x8, #0xd0]\n"
"fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x14, x2]\n"
"fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q13, [x13, x3]\n"
- "ldr q8, [x13, x12]\n"
+ "ldr q8, [x14, x13]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x17, #0xe0]\n"
+ "ldr q4, [x8, #0xe0]\n"
"fmla v28.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x14, x6]\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "ldr q5, [x13, x7]\n"
"fmla v30.4s, v0.4s, v9.4s\n"
"fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x17, #0xf0]\n"
+ "ldr q0, [x8, #0xf0]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x14, x15]\n"
"fmla v29.4s, v1.4s, v10.4s\n"
- "ldr q6, [x13, x14]\n"
"fmla v30.4s, v1.4s, v13.4s\n"
"fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x17, #0x100]\n"
+ "ldr q1, [x8, #0x100]\n"
"fmla v28.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x14, x11]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q10, [x13, x10]\n"
- "add x13, x13, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v30.4s, v2.4s, v5.4s\n"
"fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x17, #0x110]\n"
+ "ldr q2, [x8, #0x110]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x12]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ld1 { v11.4s }, [x11]\n"
"fmla v30.4s, v3.4s, v6.4s\n"
"fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x17, #0x120]\n"
+ "ldr q3, [x8, #0x120]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x12, x2]\n"
"fmla v29.4s, v4.4s, v14.4s\n"
- "ldr q12, [x11, x3]\n"
- "ld1 { v14.4s }, [x16]\n"
+ "ld1 { v14.4s }, [x17]\n"
"fmla v30.4s, v4.4s, v8.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x17, #0x130]\n"
+ "ldr q4, [x8, #0x130]\n"
"fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x6]\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "ldr q9, [x11, x7]\n"
"fmla v30.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x12, x15]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
- "ldr q11, [x11, x14]\n"
- "ldr q0, [x17, #0x150]\n"
+ "ldr q0, [x8, #0x150]\n"
"fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q13, [x7, x6]\n"
"fmla v29.4s, v1.4s, v5.4s\n"
- "ldr q13, [x8, x7]\n"
"fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x12, x13]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
- "ldr q12, [x11, x12]\n"
- "ldr q1, [x17, #0x160]\n"
+ "ldr q1, [x8, #0x160]\n"
"fmla v28.4s, v2.4s, v5.4s\n"
+ "ld1 { v5.4s }, [x4]\n"
"fmla v29.4s, v2.4s, v6.4s\n"
- "ld1 { v5.4s }, [x5]\n"
"fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x11]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q9, [x11, x10]\n"
- "add x11, x11, #0x10\n"
+ "ldr q2, [x8, #0x170]\n"
"fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr q6, [x4, x2]\n"
"fmla v29.4s, v3.4s, v8.4s\n"
- "ldr q6, [x5, x3]\n"
- "ldr q2, [x17, #0x170]\n"
+ "add x12, x12, #0x10\n"
"fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x4, x15]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q11, [x5, x14]\n"
- "ldr q3, [x17, #0x180]\n"
+ "ldr q3, [x8, #0x180]\n"
"fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr q8, [x7, x2]\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "ldr q8, [x8, x3]\n"
+ "ldr q10, [x7, x11]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x4, x13]\n"
"fmla v31.4s, v4.4s, v9.4s\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
"fmax v29.4s, v29.4s, v18.4s\n"
- "ldr q9, [x5, x7]\n"
+ "add x8, x8, #0x1a0\n"
"fmax v30.4s, v30.4s, v18.4s\n"
"fmax v31.4s, v31.4s, v18.4s\n"
- "ldr q12, [x5, x12]\n"
- "ldr q10, [x8, x10]\n"
"fmin v28.4s, v28.4s, v17.4s\n"
"fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x6]\n"
- "ldr q4, [x17, #0x190]\n"
+ "st1 { v28.4s }, [x5]\n"
"fmin v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x6, x4]\n"
- "add x6, x6, #0x10\n"
- "st1 { v30.4s }, [x9]\n"
- "add x17, x17, #0x1a0\n"
- "str q31, [x9, x4]\n"
- "add x9, x9, #0x10\n"
+ "str q29, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.4s }, [x10]\n"
+ "str q31, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x7, x15]\n"
"mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr q5, [x8, x14]\n"
"mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x7, x13]\n"
"fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q6, [x8, x12]\n"
- "add x8, x8, #0x10\n"
+ "add x7, x7, #0x10\n"
"fmla v30.4s, v1.4s, v8.4s\n"
"fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q1, [x17, #0x10]\n"
+ "ldr q1, [x8, #0x10]\n"
"fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x4, x11]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q9, [x5, x10]\n"
- "add x5, x5, #0x10\n"
+ "add x4, x4, #0x10\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"fmla v31.4s, v2.4s, v5.4s\n"
- "ldr q2, [x17, #0x20]\n"
+ "ldr q2, [x8, #0x20]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x17, x2]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q11, [x16, x3]\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"fmla v31.4s, v3.4s, v6.4s\n"
- "ldr q3, [x17, #0x30]\n"
+ "ldr q3, [x8, #0x30]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x17, x6]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q12, [x16, x7]\n"
- "ldr q9, [x16, x14]\n"
+ "ldr q9, [x17, x15]\n"
"fmla v30.4s, v4.4s, v6.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x17, #0x40]\n"
+ "ldr q4, [x8, #0x40]\n"
"fmla v28.4s, v0.4s, v7.4s\n"
"fmla v29.4s, v0.4s, v8.4s\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"fmla v31.4s, v0.4s, v11.4s\n"
- "ldr q0, [x17, #0x50]\n"
+ "ldr q0, [x8, #0x50]\n"
"fmla v28.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x17, x11]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr q8, [x16, x10]\n"
"fmla v30.4s, v1.4s, v11.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "ldr q1, [x17, #0x60]\n"
+ "ldr q1, [x8, #0x60]\n"
"fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x17, x13]\n"
"fmla v29.4s, v2.4s, v5.4s\n"
- "ldr q13, [x16, x12]\n"
- "add x16, x16, #0x10\n"
+ "add x17, x17, #0x10\n"
"fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q2, [x17, #0x70]\n"
+ "ldr q2, [x8, #0x70]\n"
"fmla v28.4s, v3.4s, v5.4s\n"
+ "ld1 { v5.4s }, [x16]\n"
"fmla v29.4s, v3.4s, v6.4s\n"
- "ld1 { v5.4s }, [x15]\n"
"fmla v30.4s, v3.4s, v9.4s\n"
"fmla v31.4s, v3.4s, v13.4s\n"
- "ldr q3, [x17, #0x80]\n"
+ "ldr q3, [x8, #0x80]\n"
"fmla v28.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x16, x2]\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q6, [x15, x3]\n"
- "ldr q10, [x15, x7]\n"
+ "ldr q10, [x16, x6]\n"
"fmla v30.4s, v4.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v8.4s\n"
- "ldr q4, [x17, #0x90]\n"
+ "ldr q4, [x8, #0x90]\n"
"fmla v28.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x16, x11]\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q14, [x15, x10]\n"
"fmla v30.4s, v0.4s, v5.4s\n"
"fmla v31.4s, v0.4s, v6.4s\n"
- "ldr q0, [x17, #0xa0]\n"
+ "ldr q0, [x8, #0xa0]\n"
"fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x15]\n"
"fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q11, [x15, x14]\n"
"fmla v30.4s, v1.4s, v6.4s\n"
"fmla v31.4s, v1.4s, v10.4s\n"
- "ldr q1, [x17, #0xb0]\n"
+ "ldr q1, [x8, #0xb0]\n"
"fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x16, x13]\n"
"fmla v29.4s, v2.4s, v9.4s\n"
- "ldr q12, [x15, x12]\n"
- "add x15, x15, #0x10\n"
+ "add x16, x16, #0x10\n"
"fmla v30.4s, v2.4s, v10.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q2, [x17, #0xc0]\n"
+ "ldr q2, [x8, #0xc0]\n"
"fmla v28.4s, v3.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x14]\n"
"fmla v29.4s, v3.4s, v13.4s\n"
- "ld1 { v9.4s }, [x13]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldr q3, [x17, #0xd0]\n"
+ "ldr q3, [x8, #0xd0]\n"
"fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x14, x2]\n"
"fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q13, [x13, x3]\n"
- "ldr q8, [x13, x12]\n"
+ "ldr q8, [x14, x13]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"fmla v31.4s, v4.4s, v14.4s\n"
- "ldr q4, [x17, #0xe0]\n"
+ "ldr q4, [x8, #0xe0]\n"
"fmla v28.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x14, x6]\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "ldr q5, [x13, x7]\n"
"fmla v30.4s, v0.4s, v9.4s\n"
"fmla v31.4s, v0.4s, v13.4s\n"
- "ldr q0, [x17, #0xf0]\n"
+ "ldr q0, [x8, #0xf0]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x14, x15]\n"
"fmla v29.4s, v1.4s, v10.4s\n"
- "ldr q6, [x13, x14]\n"
"fmla v30.4s, v1.4s, v13.4s\n"
"fmla v31.4s, v1.4s, v5.4s\n"
- "ldr q1, [x17, #0x100]\n"
+ "ldr q1, [x8, #0x100]\n"
"fmla v28.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x14, x11]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q10, [x13, x10]\n"
- "add x13, x13, #0x10\n"
+ "add x14, x14, #0x10\n"
"fmla v30.4s, v2.4s, v5.4s\n"
"fmla v31.4s, v2.4s, v6.4s\n"
- "ldr q2, [x17, #0x110]\n"
+ "ldr q2, [x8, #0x110]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x12]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ld1 { v11.4s }, [x11]\n"
"fmla v30.4s, v3.4s, v6.4s\n"
"fmla v31.4s, v3.4s, v8.4s\n"
- "ldr q3, [x17, #0x120]\n"
+ "ldr q3, [x8, #0x120]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x12, x2]\n"
"fmla v29.4s, v4.4s, v14.4s\n"
- "ldr q12, [x11, x3]\n"
"fmla v30.4s, v4.4s, v8.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "ldr q4, [x17, #0x130]\n"
- "add x17, x17, #0x140\n"
+ "ldr q4, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
"fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x6]\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "ldr q9, [x11, x7]\n"
"fmla v30.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x12, x15]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
- "ldr q11, [x11, x14]\n"
"fmla v28.4s, v1.4s, v13.4s\n"
"fmla v29.4s, v1.4s, v5.4s\n"
"fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x12, x13]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
- "ldr q12, [x11, x12]\n"
"fmla v28.4s, v2.4s, v5.4s\n"
"fmla v29.4s, v2.4s, v6.4s\n"
"fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x11]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q9, [x11, x10]\n"
- "add x11, x11, #0x10\n"
+ "add x12, x12, #0x10\n"
"fmla v28.4s, v3.4s, v6.4s\n"
"fmla v29.4s, v3.4s, v8.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
@@ -498,72 +498,72 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmax v31.4s, v31.4s, v18.4s\n"
"fmin v28.4s, v28.4s, v17.4s\n"
"fmin v29.4s, v29.4s, v17.4s\n"
- "st1 { v28.4s }, [x6]\n"
+ "st1 { v28.4s }, [x5]\n"
"fmin v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x6, x4]\n"
- "add x6, x6, #0x10\n"
- "st1 { v30.4s }, [x9]\n"
- "str q31, [x9, x4]\n"
- "add x9, x9, #0x10\n"
+ "str q29, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.4s }, [x10]\n"
+ "str q31, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 61f\n"
- "ldr q16, [x17, #0x0]\n"
- "ldr q0, [x17, #0x10]\n"
- "ldr q1, [x17, #0x20]\n"
- "ldr q2, [x17, #0x30]\n"
- "add x28, x5, XZR\n"
- "add x27, x5, x3\n"
- "ldr q3, [x17, #0x40]\n"
- "ldr q4, [x17, #0x50]\n"
- "add x26, x8, XZR\n"
- "add x25, x8, x3\n"
- "add x24, x5, x7\n"
- "add x23, x8, x7\n"
- "add x22, x5, x14\n"
- "add x21, x5, x12\n"
- "add x20, x8, x10\n"
- "add x19, x16, XZR\n"
- "add x17, x17, #0x60\n"
+ "ldr q16, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
"tbz %x[n_channels], #1, 5f\n"
- "ldr d5, [x28], #0x8\n"
- "ldr d6, [x27], #0x8\n"
- "ldr d7, [x26], #0x8\n"
- "ldr d8, [x25], #0x8\n"
- "ldr d9, [x24], #0x8\n"
- "ldr d13, [x23], #0x8\n"
- "ldr d11, [x22], #0x8\n"
- "ldr d12, [x21], #0x8\n"
- "ldr d10, [x20], #0x8\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v5.s }[2], [x28]\n"
- "ld1 { v6.s }[2], [x27]\n"
- "ld1 { v7.s }[2], [x26]\n"
- "ld1 { v8.s }[2], [x25]\n"
- "ld1 { v9.s }[2], [x24]\n"
- "ld1 { v13.s }[2], [x23]\n"
- "ld1 { v11.s }[2], [x22]\n"
- "ld1 { v12.s }[2], [x21]\n"
- "ld1 { v10.s }[2], [x20]\n"
- "ld1 { v14.s }[2], [x19]\n"
+ "ld1 { v5.s }[2], [x9]\n"
+ "ld1 { v6.s }[2], [x28]\n"
+ "ld1 { v7.s }[2], [x27]\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v9.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v11.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x21]\n"
+ "ld1 { v14.s }[2], [x20]\n"
"b 6f\n"
"5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
- "ldr s5, [x28, #0x0]\n"
- "ldr s6, [x27, #0x0]\n"
- "ldr s7, [x26, #0x0]\n"
- "ldr s8, [x25, #0x0]\n"
- "ldr s9, [x24, #0x0]\n"
- "ldr s13, [x23, #0x0]\n"
- "ldr s11, [x22, #0x0]\n"
- "ldr s12, [x21, #0x0]\n"
- "ldr s10, [x20, #0x0]\n"
- "ldr s14, [x19, #0x0]\n"
+ "ldr s5, [x9, #0x0]\n"
+ "ldr s6, [x28, #0x0]\n"
+ "ldr s7, [x27, #0x0]\n"
+ "ldr s8, [x26, #0x0]\n"
+ "ldr s9, [x25, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s11, [x23, #0x0]\n"
+ "ldr s12, [x22, #0x0]\n"
+ "ldr s10, [x21, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
"mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
"mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "add x19, x8, x14\n"
+ "add x20, x7, x15\n"
"mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
@@ -574,364 +574,364 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"tbz %x[n_channels], #1, 7f\n"
- "ldr d5, [x19], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v5.s }[2], [x19]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 8f\n"
"7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
- "ldr s5, [x19, #0x0]\n"
+ "ldr s5, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
"fmla v31.4s, v2.4s, v5.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x19, x8, x12\n"
+ "add x20, x7, x13\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d6, [x19], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v6.s }[2], [x19]\n"
+ "ld1 { v6.s }[2], [x20]\n"
"b 10f\n"
"9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
- "ldr s6, [x19, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
"fmla v31.4s, v3.4s, v6.4s\n"
"fmla v28.4s, v4.4s, v12.4s\n"
- "add x19, x5, x10\n"
+ "add x20, x4, x11\n"
"tbz %x[n_channels], #1, 11f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 12f\n"
"11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
"fmla v30.4s, v4.4s, v6.4s\n"
- "ldr q0, [x17, #0x0]\n"
- "add x19, x16, x3\n"
+ "add x20, x17, x2\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"fmla v28.4s, v0.4s, v7.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"fmla v29.4s, v0.4s, v8.4s\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 14f\n"
"13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.4s, v0.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v8.4s\n"
- "add x19, x16, x7\n"
+ "add x20, x17, x6\n"
"fmla v29.4s, v1.4s, v13.4s\n"
"fmla v30.4s, v1.4s, v11.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 15f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 16f\n"
"15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.4s, v1.4s, v12.4s\n"
"fmla v28.4s, v2.4s, v13.4s\n"
- "add x19, x16, x14\n"
+ "add x20, x17, x15\n"
"fmla v29.4s, v2.4s, v5.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 17f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 18f\n"
"17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
"fmla v28.4s, v3.4s, v5.4s\n"
- "add x19, x16, x12\n"
+ "add x20, x17, x13\n"
"fmla v29.4s, v3.4s, v6.4s\n"
"fmla v30.4s, v3.4s, v9.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 19f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 20f\n"
"19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.4s, v3.4s, v13.4s\n"
"fmla v28.4s, v4.4s, v6.4s\n"
- "add x19, x16, x10\n"
+ "add x20, x17, x11\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v13.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 21f\n"
- "ldr d8, [x19], #0x8\n"
+ "ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
- "ld1 { v8.s }[2], [x19]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 22f\n"
"21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
- "ldr s8, [x19, #0x0]\n"
+ "ldr s8, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v31.4s, v4.4s, v8.4s\n"
"fmla v28.4s, v0.4s, v14.4s\n"
- "add x19, x15, XZR\n"
+ "add x20, x16, XZR\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 23f\n"
- "ldr d5, [x19], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "ld1 { v5.s }[2], [x19]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 24f\n"
"23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
- "ldr s5, [x19, #0x0]\n"
+ "ldr s5, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
"fmla v30.4s, v0.4s, v5.4s\n"
- "add x19, x15, x3\n"
+ "add x20, x16, x2\n"
"tbz %x[n_channels], #1, 25f\n"
- "ldr d6, [x19], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
- "ld1 { v6.s }[2], [x19]\n"
+ "ld1 { v6.s }[2], [x20]\n"
"b 26f\n"
"25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
- "ldr s6, [x19, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.4s, v0.4s, v6.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
- "add x19, x15, x7\n"
+ "add x20, x16, x6\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v30.4s, v1.4s, v6.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 27f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 28f\n"
"27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.4s, v1.4s, v10.4s\n"
"fmla v28.4s, v2.4s, v12.4s\n"
- "add x19, x15, x14\n"
+ "add x20, x16, x15\n"
"fmla v29.4s, v2.4s, v9.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 29f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 30f\n"
"29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v9.4s\n"
- "add x19, x15, x12\n"
+ "add x20, x16, x13\n"
"fmla v29.4s, v3.4s, v13.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 31f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 32f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 32f\n"
"31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
"fmla v28.4s, v4.4s, v13.4s\n"
- "add x19, x15, x10\n"
+ "add x20, x16, x11\n"
"fmla v29.4s, v4.4s, v8.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 33f\n"
- "ldr d14, [x19], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
- "ld1 { v14.s }[2], [x19]\n"
+ "ld1 { v14.s }[2], [x20]\n"
"b 34f\n"
"33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
- "ldr s14, [x19, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
"34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v31.4s, v4.4s, v14.4s\n"
"fmla v28.4s, v0.4s, v5.4s\n"
- "add x19, x13, XZR\n"
+ "add x20, x14, XZR\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 35f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 36f\n"
"35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
"fmla v30.4s, v0.4s, v9.4s\n"
- "add x19, x13, x3\n"
+ "add x20, x14, x2\n"
"tbz %x[n_channels], #1, 37f\n"
- "ldr d13, [x19], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x20]\n"
"b 38f\n"
"37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
- "ldr s13, [x19, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
"38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.4s, v0.4s, v13.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
- "add x19, x13, x7\n"
+ "add x20, x14, x6\n"
"fmla v29.4s, v1.4s, v10.4s\n"
"fmla v30.4s, v1.4s, v13.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 39f\n"
- "ldr d5, [x19], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #0, 40f\n"
- "ld1 { v5.s }[2], [x19]\n"
+ "ld1 { v5.s }[2], [x20]\n"
"b 40f\n"
"39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
- "ldr s5, [x19, #0x0]\n"
+ "ldr s5, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.4s, v1.4s, v5.4s\n"
"fmla v28.4s, v2.4s, v10.4s\n"
- "add x19, x13, x14\n"
+ "add x20, x14, x15\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v5.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 41f\n"
- "ldr d6, [x19], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #0, 42f\n"
- "ld1 { v6.s }[2], [x19]\n"
+ "ld1 { v6.s }[2], [x20]\n"
"b 42f\n"
"41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
- "ldr s6, [x19, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
"42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.4s, v2.4s, v6.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x19, x13, x12\n"
+ "add x20, x14, x13\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v6.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 43f\n"
- "ldr d8, [x19], #0x8\n"
+ "ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #0, 44f\n"
- "ld1 { v8.s }[2], [x19]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 44f\n"
"43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
- "ldr s8, [x19, #0x0]\n"
+ "ldr s8, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.4s, v3.4s, v8.4s\n"
"fmla v28.4s, v4.4s, v12.4s\n"
- "add x19, x13, x10\n"
+ "add x20, x14, x11\n"
"fmla v29.4s, v4.4s, v14.4s\n"
"fmla v30.4s, v4.4s, v8.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 45f\n"
- "ldr d10, [x19], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 46f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ld1 { v10.s }[2], [x20]\n"
"b 46f\n"
"45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
- "ldr s10, [x19, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
"46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
- "ldr q0, [x17, #0x0]\n"
+ "ldr q0, [x8, #0x0]\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"fmla v28.4s, v0.4s, v9.4s\n"
- "add x19, x11, XZR\n"
+ "add x20, x12, XZR\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 47f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 48f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
"fmla v30.4s, v0.4s, v11.4s\n"
- "add x19, x11, x3\n"
+ "add x20, x12, x2\n"
"tbz %x[n_channels], #1, 49f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 50f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 50f\n"
"49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
- "ldr q1, [x17, #0x0]\n"
+ "ldr q1, [x8, #0x0]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
"fmla v28.4s, v1.4s, v13.4s\n"
- "add x19, x11, x7\n"
+ "add x20, x12, x6\n"
"fmla v29.4s, v1.4s, v5.4s\n"
"fmla v30.4s, v1.4s, v12.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 51f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 52f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 52f\n"
"51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
- "ldr q2, [x17, #0x0]\n"
+ "ldr q2, [x8, #0x0]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v5.4s\n"
- "add x19, x11, x14\n"
+ "add x20, x12, x15\n"
"fmla v29.4s, v2.4s, v6.4s\n"
"fmla v30.4s, v2.4s, v9.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 53f\n"
- "ldr d11, [x19], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 54f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x20]\n"
"b 54f\n"
"53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
- "ldr s11, [x19, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
"54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
- "ldr q3, [x17, #0x0]\n"
+ "ldr q3, [x8, #0x0]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v6.4s\n"
- "add x19, x11, x12\n"
+ "add x20, x12, x13\n"
"fmla v29.4s, v3.4s, v8.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x17, x17, #0x10\n"
+ "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 55f\n"
- "ldr d12, [x19], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 56f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "ld1 { v12.s }[2], [x20]\n"
"b 56f\n"
"55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
- "ldr s12, [x19, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
- "ldr q4, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
"fmla v28.4s, v4.4s, v8.4s\n"
- "add x19, x11, x10\n"
+ "add x20, x12, x11\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"tbz %x[n_channels], #1, 57f\n"
- "ldr d9, [x19], #0x8\n"
+ "ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 58f\n"
- "ld1 { v9.s }[2], [x19]\n"
+ "ld1 { v9.s }[2], [x20]\n"
"b 58f\n"
"57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
- "ldr s9, [x19, #0x0]\n"
+ "ldr s9, [x20, #0x0]\n"
"58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
"fmla v31.4s, v4.4s, v9.4s\n"
"fmax v28.4s, v28.4s, v18.4s\n"
@@ -943,46 +943,46 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmin v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v17.4s\n"
"tbz %x[n_channels], #1, 59f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.d }[0], [x20], x4\n"
- "add x6, x6, #0x8\n"
- "add x9, x9, #0x8\n"
- "st1 { v30.d }[0], [x19], x4\n"
- "st1 { v29.d }[0], [x20]\n"
- "st1 { v31.d }[0], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 60f\n"
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.s }[2], [x20], x4\n"
- "st1 { v30.s }[2], [x19], x4\n"
- "st1 { v29.s }[2], [x20]\n"
- "st1 { v31.s }[2], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
"b 60f\n"
"59:" // Tile loop: Oddments: Store: Bit 1: Unset
- "mov x20, x6\n"
- "mov x19, x9\n"
- "st1 { v28.s }[0], [x20], x4\n"
- "st1 { v30.s }[0], [x19], x4\n"
- "st1 { v29.s }[0], [x20]\n"
- "st1 { v31.s }[0], [x19]\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
"60:" // Tile loop: Oddments: Store: Bit 1: End
"61:" // Tile loop: End
- "ldr x25, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x25, x25, #0x1\n"
- "add x20, x26, #0x1\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x25, x19\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x26, x26, x20, LT\n"
- "csel x25, x25, XZR, LT\n"
- "cmp x26, x19\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 4754a6f6f1..860adac326 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,422 +99,422 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x28, #0x10\n" // cntb _, ALL, #1
- "lsr x27, %x[n_channels], #0x2\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x2\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "add x19, %x[params_struct], %[offsetof_args_max]\n"
- "ldp x15, x14, [x21, #0x0]\n"
- "ldp x13, x12, [x21, #0x10]\n"
- "add x11, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ld1r { v18.4s }, [x20]\n"
- "ld1r { v17.4s }, [x19]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
"mov x10, #0x0\n"
- "sub x9, XZR, x28\n"
- "cbz x27, 3f\n"
- "ldp x26, x25, [x11, #0x0]\n"
- "ldr q5, [x26, x10]\n"
- "ldr q6, [x25, x10]\n"
- "ldp x24, x23, [x11, #0x10]\n"
- "cmp x28, x27, LSL #4\n"
- "ldp x22, x21, [x11, #0x20]\n"
- "ldp x20, x19, [x11, #0x30]\n"
- "ldp x26, x25, [x11, #0x40]\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
"ldr q16, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "ldr q7, [x24, x10]\n"
"add x16, x16, #0x60\n"
- "ldr q8, [x23, x10]\n"
- "ldr q9, [x22, x10]\n"
- "ldr q13, [x21, x10]\n"
- "ldr q11, [x20, x10]\n"
- "ldr q12, [x19, x10]\n"
- "ldr q10, [x26, x10]\n"
- "ldr q14, [x25, x10]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldr q5, [x27, x10]\n"
+ "ldr q6, [x26, x10]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldr q7, [x25, x10]\n"
+ "ldr q8, [x24, x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldr q9, [x23, x10]\n"
+ "ldr q13, [x22, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x27, x26, [x15, #0x40]\n"
+ "ldr q10, [x27, x10]\n"
+ "ldr q14, [x26, x10]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
"mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x24, [x11, #0x50]\n"
- "ldr q5, [x24, x10]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr q5, [x25, x10]\n"
"mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr x23, [x11, #0x58]\n"
- "ldr x22, [x11, #0x60]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr q16, [x16, #0x140]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q6, [x23, x10]\n"
- "ldr x21, [x11, #0x68]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr q6, [x24, x10]\n"
"fmla v30.4s, v1.4s, v8.4s\n"
"fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x20, [x11, #0x70]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr x23, [x15, #0x60]\n"
"fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x23, x10]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q9, [x22, x10]\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr x22, [x15, #0x68]\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"fmla v31.4s, v2.4s, v5.4s\n"
- "ldr x19, [x11, #0x78]\n"
"ldr q2, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q11, [x21, x10]\n"
- "ldr x26, [x11, #0x80]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"fmla v31.4s, v3.4s, v6.4s\n"
"ldr q3, [x16, #0x30]\n"
- "ldr x25, [x11, #0x88]\n"
+ "ldr x27, [x15, #0x80]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q12, [x20, x10]\n"
- "ldr q9, [x19, x10]\n"
+ "ldr q9, [x20, x10]\n"
"fmla v30.4s, v4.4s, v6.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"ldr q4, [x16, #0x40]\n"
- "ldr x24, [x11, #0x90]\n"
+ "ldr x26, [x15, #0x88]\n"
"fmla v28.4s, v0.4s, v7.4s\n"
"fmla v29.4s, v0.4s, v8.4s\n"
- "ldr x23, [x11, #0x98]\n"
- "ldr x22, [x11, #0xa0]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "ldr x24, [x15, #0x98]\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"fmla v31.4s, v0.4s, v11.4s\n"
"ldr q0, [x16, #0x50]\n"
- "ldr x21, [x11, #0xa8]\n"
+ "ldr x23, [x15, #0xa0]\n"
"fmla v28.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x26, x10]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr q8, [x25, x10]\n"
- "ldr x20, [x11, #0xb0]\n"
+ "ldr x22, [x15, #0xa8]\n"
"fmla v30.4s, v1.4s, v11.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
"ldr q1, [x16, #0x60]\n"
- "ldr x19, [x11, #0xb8]\n"
+ "ldr x21, [x15, #0xb0]\n"
"fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x27, x10]\n"
"fmla v29.4s, v2.4s, v5.4s\n"
- "ldr q13, [x26, x10]\n"
- "ldr x26, [x11, #0xc0]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v9.4s\n"
"ldr q2, [x16, #0x70]\n"
- "ldr x25, [x11, #0xc8]\n"
+ "ldr x27, [x15, #0xc0]\n"
"fmla v28.4s, v3.4s, v5.4s\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.4s, v3.4s, v6.4s\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0xd0]\n"
+ "ldr x26, [x15, #0xc8]\n"
"fmla v30.4s, v3.4s, v9.4s\n"
"fmla v31.4s, v3.4s, v13.4s\n"
"ldr q3, [x16, #0x80]\n"
- "add x9, x9, #0x10\n"
+ "ldr x25, [x15, #0xd0]\n"
"fmla v28.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q6, [x23, x10]\n"
- "ldr q10, [x22, x10]\n"
+ "ldr q10, [x23, x10]\n"
"fmla v30.4s, v4.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v8.4s\n"
"ldr q4, [x16, #0x90]\n"
- "ldr x23, [x11, #0xd8]\n"
+ "ldr x24, [x15, #0xd8]\n"
"fmla v28.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x20, x10]\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q14, [x19, x10]\n"
- "ldr x22, [x11, #0xe0]\n"
+ "ldr x23, [x15, #0xe0]\n"
"fmla v30.4s, v0.4s, v5.4s\n"
"fmla v31.4s, v0.4s, v6.4s\n"
"ldr q0, [x16, #0xa0]\n"
- "ldr x19, [x11, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q11, [x21, x10]\n"
- "ldr x21, [x11, #0xe8]\n"
+ "ldr x22, [x15, #0xe8]\n"
"fmla v30.4s, v1.4s, v6.4s\n"
"fmla v31.4s, v1.4s, v10.4s\n"
"ldr q1, [x16, #0xb0]\n"
- "ldr q16, [x16, #0x140]\n"
+ "add x28, x28, #0x10\n"
"fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.4s, v2.4s, v9.4s\n"
- "ldr q12, [x20, x10]\n"
- "ldr x20, [x11, #0xf0]\n"
+ "ldr x21, [x15, #0xf0]\n"
"fmla v30.4s, v2.4s, v10.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"ldr q2, [x16, #0xc0]\n"
"fmla v28.4s, v3.4s, v9.4s\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q9, [x26, x10]\n"
- "ldr x26, [x11, #0x100]\n"
+ "ldr x27, [x15, #0x100]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
"fmla v31.4s, v3.4s, v12.4s\n"
"ldr q3, [x16, #0xd0]\n"
"fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x26, x10]\n"
"fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q13, [x25, x10]\n"
- "ldr q8, [x22, x10]\n"
+ "ldr q8, [x23, x10]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"fmla v31.4s, v4.4s, v14.4s\n"
"ldr q4, [x16, #0xe0]\n"
- "ldr x25, [x11, #0x108]\n"
+ "ldr x26, [x15, #0x108]\n"
"fmla v28.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0x110]\n"
+ "ldr x25, [x15, #0x110]\n"
"fmla v30.4s, v0.4s, v9.4s\n"
"fmla v31.4s, v0.4s, v13.4s\n"
"ldr q0, [x16, #0xf0]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.4s, v1.4s, v10.4s\n"
- "ldr q6, [x23, x10]\n"
- "ldr x23, [x11, #0x118]\n"
+ "ldr x24, [x15, #0x118]\n"
"fmla v30.4s, v1.4s, v13.4s\n"
"fmla v31.4s, v1.4s, v5.4s\n"
"ldr q1, [x16, #0x100]\n"
"fmla v28.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x22, x10]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q10, [x21, x10]\n"
"fmla v30.4s, v2.4s, v5.4s\n"
"fmla v31.4s, v2.4s, v6.4s\n"
"ldr q2, [x16, #0x110]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x21, x10]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q11, [x20, x10]\n"
"fmla v30.4s, v3.4s, v6.4s\n"
"fmla v31.4s, v3.4s, v8.4s\n"
"ldr q3, [x16, #0x120]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x20, x10]\n"
"fmla v29.4s, v4.4s, v14.4s\n"
- "ldr q12, [x19, x10]\n"
"fmla v30.4s, v4.4s, v8.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"ldr q4, [x16, #0x130]\n"
"fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "ldr q9, [x26, x10]\n"
"fmla v30.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x26, x10]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
- "ldr q11, [x25, x10]\n"
- "ldp x26, x25, [x11, #0x0]\n"
+ "ldr q0, [x16, #0x150]\n"
"fmla v28.4s, v1.4s, v13.4s\n"
"fmla v29.4s, v1.4s, v5.4s\n"
- "ldr q0, [x16, #0x150]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x25, x10]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
- "ldr q12, [x24, x10]\n"
"ldr q1, [x16, #0x160]\n"
"fmla v28.4s, v2.4s, v5.4s\n"
+ "ldr q5, [x27, x17]\n"
"fmla v29.4s, v2.4s, v6.4s\n"
- "ldr q5, [x26, x28]\n"
"fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x10]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q9, [x23, x10]\n"
- "ldp x24, x23, [x11, #0x10]\n"
+ "ldr q2, [x16, #0x170]\n"
"fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr q6, [x26, x17]\n"
"fmla v29.4s, v3.4s, v8.4s\n"
- "ldr q6, [x25, x28]\n"
- "ldp x22, x21, [x11, #0x20]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldr q7, [x25, x17]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldp x20, x19, [x11, #0x30]\n"
- "ldp x26, x25, [x11, #0x40]\n"
+ "ldr q3, [x16, #0x180]\n"
"fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr q8, [x24, x17]\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v18.4s\n"
- "ldr q7, [x24, x28]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldr q13, [x22, x17]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"fmla v31.4s, v4.4s, v9.4s\n"
+ "ldr q9, [x23, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
"fmax v29.4s, v29.4s, v18.4s\n"
- "ldr q8, [x23, x28]\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
"fmax v30.4s, v30.4s, v18.4s\n"
"fmax v31.4s, v31.4s, v18.4s\n"
- "ldr q9, [x22, x28]\n"
- "ldr q13, [x21, x28]\n"
- "ldr q11, [x20, x28]\n"
- "ldr q12, [x19, x28]\n"
+ "ldp x27, x26, [x15, #0x40]\n"
+ "ldr q10, [x27, x17]\n"
"fmin v28.4s, v28.4s, v17.4s\n"
"fmin v29.4s, v29.4s, v17.4s\n"
- "ldr q10, [x26, x28]\n"
- "ldr q14, [x25, x28]\n"
- "add x28, x28, #0x10\n"
- "cmp x28, x27, LSL #4\n"
+ "ldr q14, [x26, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
"fmin v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v17.4s\n"
"add x10, x10, #0x10\n"
- "str q28, [x15, x9]\n"
- "str q29, [x14, x9]\n"
- "ldr q2, [x16, #0x170]\n"
- "ldr q3, [x16, #0x180]\n"
- "str q30, [x13, x9]\n"
- "ldr q4, [x16, #0x190]\n"
+ "str q28, [x14, x28]\n"
"add x16, x16, #0x1a0\n"
- "str q31, [x12, x9]\n"
+ "str q29, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q31, [x11, x28]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
"mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x24, [x11, #0x50]\n"
- "ldr q5, [x24, x10]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr q5, [x25, x10]\n"
"mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
- "ldr x23, [x11, #0x58]\n"
- "ldr x22, [x11, #0x60]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x24, [x15, #0x58]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.4s, v1.4s, v9.4s\n"
- "ldr q6, [x23, x10]\n"
- "ldr x21, [x11, #0x68]\n"
+ "ldr x23, [x15, #0x60]\n"
"fmla v30.4s, v1.4s, v8.4s\n"
"fmla v31.4s, v1.4s, v13.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x20, [x11, #0x70]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr x22, [x15, #0x68]\n"
"fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x23, x10]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q9, [x22, x10]\n"
- "ldr q1, [x16, #0x10]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"fmla v31.4s, v2.4s, v5.4s\n"
- "ldr x19, [x11, #0x78]\n"
"ldr q2, [x16, #0x20]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q11, [x21, x10]\n"
- "ldr x26, [x11, #0x80]\n"
+ "ldr x27, [x15, #0x80]\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"fmla v31.4s, v3.4s, v6.4s\n"
"ldr q3, [x16, #0x30]\n"
- "ldr x25, [x11, #0x88]\n"
+ "ldr x26, [x15, #0x88]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
- "ldr q12, [x20, x10]\n"
- "ldr q9, [x19, x10]\n"
+ "ldr q9, [x20, x10]\n"
"fmla v30.4s, v4.4s, v6.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"ldr q4, [x16, #0x40]\n"
- "ldr x24, [x11, #0x90]\n"
+ "ldr x25, [x15, #0x90]\n"
"fmla v28.4s, v0.4s, v7.4s\n"
"fmla v29.4s, v0.4s, v8.4s\n"
- "ldr x23, [x11, #0x98]\n"
- "ldr x22, [x11, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "ldr x23, [x15, #0xa0]\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"fmla v31.4s, v0.4s, v11.4s\n"
"ldr q0, [x16, #0x50]\n"
- "ldr x21, [x11, #0xa8]\n"
+ "ldr x22, [x15, #0xa8]\n"
"fmla v28.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x26, x10]\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr q8, [x25, x10]\n"
- "ldr x20, [x11, #0xb0]\n"
+ "ldr x21, [x15, #0xb0]\n"
"fmla v30.4s, v1.4s, v11.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
"ldr q1, [x16, #0x60]\n"
- "ldr x19, [x11, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x27, x10]\n"
"fmla v29.4s, v2.4s, v5.4s\n"
- "ldr q13, [x26, x10]\n"
- "ldr x26, [x11, #0xc0]\n"
+ "ldr x27, [x15, #0xc0]\n"
"fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v9.4s\n"
"ldr q2, [x16, #0x70]\n"
- "ldr x25, [x11, #0xc8]\n"
+ "ldr x26, [x15, #0xc8]\n"
"fmla v28.4s, v3.4s, v5.4s\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.4s, v3.4s, v6.4s\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0xd0]\n"
+ "ldr x25, [x15, #0xd0]\n"
"fmla v30.4s, v3.4s, v9.4s\n"
"fmla v31.4s, v3.4s, v13.4s\n"
"ldr q3, [x16, #0x80]\n"
- "add x9, x9, #0x10\n"
+ "add x28, x28, #0x10\n"
"fmla v28.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q6, [x23, x10]\n"
- "ldr q10, [x22, x10]\n"
+ "ldr q10, [x23, x10]\n"
"fmla v30.4s, v4.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v8.4s\n"
"ldr q4, [x16, #0x90]\n"
- "ldr x23, [x11, #0xd8]\n"
+ "ldr x24, [x15, #0xd8]\n"
"fmla v28.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x20, x10]\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "ldr q14, [x19, x10]\n"
- "ldr x22, [x11, #0xe0]\n"
+ "ldr x23, [x15, #0xe0]\n"
"fmla v30.4s, v0.4s, v5.4s\n"
"fmla v31.4s, v0.4s, v6.4s\n"
"ldr q0, [x16, #0xa0]\n"
- "ldr x19, [x11, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x22, x10]\n"
"fmla v29.4s, v1.4s, v12.4s\n"
- "ldr q11, [x21, x10]\n"
- "ldr x21, [x11, #0xe8]\n"
+ "ldr x22, [x15, #0xe8]\n"
"fmla v30.4s, v1.4s, v6.4s\n"
"fmla v31.4s, v1.4s, v10.4s\n"
"ldr q1, [x16, #0xb0]\n"
"fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x21, x10]\n"
"fmla v29.4s, v2.4s, v9.4s\n"
- "ldr q12, [x20, x10]\n"
- "ldr x20, [x11, #0xf0]\n"
+ "ldr x21, [x15, #0xf0]\n"
"fmla v30.4s, v2.4s, v10.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"ldr q2, [x16, #0xc0]\n"
"fmla v28.4s, v3.4s, v9.4s\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.4s, v3.4s, v13.4s\n"
- "ldr q9, [x26, x10]\n"
- "ldr x26, [x11, #0x100]\n"
+ "ldr x27, [x15, #0x100]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
"fmla v31.4s, v3.4s, v12.4s\n"
"ldr q3, [x16, #0xd0]\n"
"fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x26, x10]\n"
"fmla v29.4s, v4.4s, v8.4s\n"
- "ldr q13, [x25, x10]\n"
- "ldr q8, [x22, x10]\n"
+ "ldr q8, [x23, x10]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"fmla v31.4s, v4.4s, v14.4s\n"
"ldr q4, [x16, #0xe0]\n"
- "ldr x25, [x11, #0x108]\n"
+ "ldr x26, [x15, #0x108]\n"
"fmla v28.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x25, x10]\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "ldr q5, [x24, x10]\n"
- "ldr x24, [x11, #0x110]\n"
+ "ldr x25, [x15, #0x110]\n"
"fmla v30.4s, v0.4s, v9.4s\n"
"fmla v31.4s, v0.4s, v13.4s\n"
"ldr q0, [x16, #0xf0]\n"
"fmla v28.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x24, x10]\n"
"fmla v29.4s, v1.4s, v10.4s\n"
- "ldr q6, [x23, x10]\n"
- "ldr x23, [x11, #0x118]\n"
+ "ldr x24, [x15, #0x118]\n"
"fmla v30.4s, v1.4s, v13.4s\n"
"fmla v31.4s, v1.4s, v5.4s\n"
"ldr q1, [x16, #0x100]\n"
"fmla v28.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x22, x10]\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q10, [x21, x10]\n"
"fmla v30.4s, v2.4s, v5.4s\n"
"fmla v31.4s, v2.4s, v6.4s\n"
"ldr q2, [x16, #0x110]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x21, x10]\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "ldr q11, [x20, x10]\n"
"fmla v30.4s, v3.4s, v6.4s\n"
"fmla v31.4s, v3.4s, v8.4s\n"
"ldr q3, [x16, #0x120]\n"
"fmla v28.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x20, x10]\n"
"fmla v29.4s, v4.4s, v14.4s\n"
- "ldr q12, [x19, x10]\n"
"fmla v30.4s, v4.4s, v8.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"ldr q4, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
"fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x27, x10]\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "ldr q9, [x26, x10]\n"
"fmla v30.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x26, x10]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
- "ldr q11, [x25, x10]\n"
"fmla v28.4s, v1.4s, v13.4s\n"
"fmla v29.4s, v1.4s, v5.4s\n"
"fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x25, x10]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
- "ldr q12, [x24, x10]\n"
"fmla v28.4s, v2.4s, v5.4s\n"
"fmla v29.4s, v2.4s, v6.4s\n"
"fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x10]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr q9, [x23, x10]\n"
"add x10, x10, #0x10\n"
"fmla v28.4s, v3.4s, v6.4s\n"
"fmla v29.4s, v3.4s, v8.4s\n"
@@ -530,86 +530,86 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmax v31.4s, v31.4s, v18.4s\n"
"fmin v28.4s, v28.4s, v17.4s\n"
"fmin v29.4s, v29.4s, v17.4s\n"
- "str q28, [x15, x9]\n"
+ "str q28, [x14, x28]\n"
"fmin v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v17.4s\n"
- "str q29, [x14, x9]\n"
- "str q30, [x13, x9]\n"
- "str q31, [x12, x9]\n"
+ "str q29, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q31, [x11, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 60f\n"
- "mov x9, x10\n"
- "ldr x28, [x11, #0x0]\n"
- "ldr x27, [x11, #0x8]\n"
- "ldr x26, [x11, #0x10]\n"
- "add x15, x15, x9\n"
- "add x14, x14, x9\n"
- "ldr x25, [x11, #0x18]\n"
- "ldr x24, [x11, #0x20]\n"
- "add x13, x13, x9\n"
- "add x12, x12, x9\n"
- "ldr x23, [x11, #0x28]\n"
- "ldr x22, [x11, #0x30]\n"
+ "ldr q16, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x28, x10\n"
+ "add x14, x14, x28\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x28\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
"add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
"add x27, x27, x10\n"
- "ldr x21, [x11, #0x38]\n"
- "ldr x20, [x11, #0x40]\n"
"add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
"add x25, x25, x10\n"
- "ldr x19, [x11, #0x48]\n"
- "ldr q16, [x16, #0x0]\n"
"add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
"add x23, x23, x10\n"
- "ldr q0, [x16, #0x10]\n"
- "ldr q1, [x16, #0x20]\n"
"add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
"add x21, x21, x10\n"
- "ldr q2, [x16, #0x30]\n"
- "ldr q3, [x16, #0x40]\n"
"add x20, x20, x10\n"
- "add x19, x19, x10\n"
- "ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v5.d }[0], [x28], #0x8\n"
- "ld1 { v6.d }[0], [x27], #0x8\n"
- "ld1 { v7.d }[0], [x26], #0x8\n"
- "ld1 { v8.d }[0], [x25], #0x8\n"
- "ld1 { v9.d }[0], [x24], #0x8\n"
- "ld1 { v13.d }[0], [x23], #0x8\n"
- "ld1 { v11.d }[0], [x22], #0x8\n"
- "ld1 { v12.d }[0], [x21], #0x8\n"
- "ld1 { v10.d }[0], [x20], #0x8\n"
- "ld1 { v14.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 5f\n"
- "ld1 { v5.s }[2], [x28], #0x4\n"
- "ld1 { v6.s }[2], [x27], #0x4\n"
- "ld1 { v7.s }[2], [x26], #0x4\n"
- "ld1 { v8.s }[2], [x25], #0x4\n"
- "ld1 { v9.s }[2], [x24], #0x4\n"
- "ld1 { v13.s }[2], [x23], #0x4\n"
- "ld1 { v11.s }[2], [x22], #0x4\n"
- "ld1 { v12.s }[2], [x21], #0x4\n"
- "ld1 { v10.s }[2], [x20], #0x4\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"b 5f\n"
"4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
- "ld1 { v5.s }[0], [x28], #0x4\n"
- "ld1 { v6.s }[0], [x27], #0x4\n"
- "ld1 { v7.s }[0], [x26], #0x4\n"
- "ld1 { v8.s }[0], [x25], #0x4\n"
- "ld1 { v9.s }[0], [x24], #0x4\n"
- "ld1 { v13.s }[0], [x23], #0x4\n"
- "ld1 { v11.s }[0], [x22], #0x4\n"
- "ld1 { v12.s }[0], [x21], #0x4\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "ld1 { v14.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
"mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v5.4s\n"
"mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "ldr x19, [x11, #0x50]\n"
- "add x19, x19, x10\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
"mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
@@ -620,389 +620,389 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v5.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"b 7f\n"
"6:" // Oddments: Load input (1, 3): Bit 1: Unset
- "ld1 { v5.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x19, [x11, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v31.4s, v2.4s, v5.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v6.d }[0], [x19], #0x8\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"b 9f\n"
"8:" // Oddments: Load input (1, 4): Bit 1: Unset
- "ld1 { v6.s }[0], [x19], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (1, 4): Bit 1: End
- "ldr x19, [x11, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v31.4s, v3.4s, v6.4s\n"
"fmla v28.4s, v4.4s, v12.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Load input (0, 5): Bit 1: Unset
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (0, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
"fmla v30.4s, v4.4s, v6.4s\n"
- "ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"fmla v28.4s, v0.4s, v7.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"fmla v29.4s, v0.4s, v8.4s\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 13f\n"
"12:" // Oddments: Load input (2, 1): Bit 1: Unset
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (2, 1): Bit 1: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v31.4s, v0.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v8.4s\n"
"fmla v29.4s, v1.4s, v13.4s\n"
"fmla v30.4s, v1.4s, v11.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 15f\n"
"14:" // Oddments: Load input (2, 2): Bit 1: Unset
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 2): Bit 1: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v31.4s, v1.4s, v12.4s\n"
"fmla v28.4s, v2.4s, v13.4s\n"
"fmla v29.4s, v2.4s, v5.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: Load input (2, 3): Bit 1: Unset
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (2, 3): Bit 1: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
"fmla v28.4s, v3.4s, v5.4s\n"
"fmla v29.4s, v3.4s, v6.4s\n"
"fmla v30.4s, v3.4s, v9.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 19f\n"
"18:" // Oddments: Load input (2, 4): Bit 1: Unset
- "ld1 { v13.s }[0], [x19], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (2, 4): Bit 1: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v31.4s, v3.4s, v13.4s\n"
"fmla v28.4s, v4.4s, v6.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v13.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v8.d }[0], [x19], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v8.s }[2], [x19], #0x4\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
"b 21f\n"
"20:" // Oddments: Load input (2, 5): Bit 1: Unset
- "ld1 { v8.s }[0], [x19], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (2, 5): Bit 1: End
"ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v31.4s, v4.4s, v8.4s\n"
"fmla v28.4s, v0.4s, v14.4s\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v5.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"b 23f\n"
"22:" // Oddments: Load input (3, 0): Bit 1: Unset
- "ld1 { v5.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (3, 0): Bit 1: End
- "ldr x19, [x11, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.4s, v0.4s, v5.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #1, 24f\n"
- "ld1 { v6.d }[0], [x19], #0x8\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"b 25f\n"
"24:" // Oddments: Load input (3, 1): Bit 1: Unset
- "ld1 { v6.s }[0], [x19], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 1): Bit 1: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v31.4s, v0.4s, v6.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v30.4s, v1.4s, v6.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 27f\n"
"26:" // Oddments: Load input (3, 2): Bit 1: Unset
- "ld1 { v10.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (3, 2): Bit 1: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v31.4s, v1.4s, v10.4s\n"
"fmla v28.4s, v2.4s, v12.4s\n"
"fmla v29.4s, v2.4s, v9.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 29f\n"
"28:" // Oddments: Load input (3, 3): Bit 1: Unset
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (3, 3): Bit 1: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v9.4s\n"
"fmla v29.4s, v3.4s, v13.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 31f\n"
"30:" // Oddments: Load input (3, 4): Bit 1: Unset
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (3, 4): Bit 1: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
"fmla v28.4s, v4.4s, v13.4s\n"
"fmla v29.4s, v4.4s, v8.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 32f\n"
- "ld1 { v14.d }[0], [x19], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v14.s }[2], [x19], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
"b 33f\n"
"32:" // Oddments: Load input (3, 5): Bit 1: Unset
- "ld1 { v14.s }[0], [x19], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (3, 5): Bit 1: End
"ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v31.4s, v4.4s, v14.4s\n"
"fmla v28.4s, v0.4s, v5.4s\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 34f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 35f\n"
"34:" // Oddments: Load input (4, 0): Bit 1: Unset
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (4, 0): Bit 1: End
- "ldr x19, [x11, #0xc8]\n"
+ "ldr x20, [x15, #0xc8]\n"
"fmla v30.4s, v0.4s, v9.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #1, 36f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
- "ld1 { v13.s }[2], [x19], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
"b 37f\n"
"36:" // Oddments: Load input (4, 1): Bit 1: Unset
- "ld1 { v13.s }[0], [x19], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (4, 1): Bit 1: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0xd0]\n"
+ "ldr x20, [x15, #0xd0]\n"
"fmla v31.4s, v0.4s, v13.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v10.4s\n"
"fmla v30.4s, v1.4s, v13.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 38f\n"
- "ld1 { v5.d }[0], [x19], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
- "ld1 { v5.s }[2], [x19], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
"b 39f\n"
"38:" // Oddments: Load input (4, 2): Bit 1: Unset
- "ld1 { v5.s }[0], [x19], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
"39:" // Oddments: Load input (4, 2): Bit 1: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0xd8]\n"
+ "ldr x20, [x15, #0xd8]\n"
"fmla v31.4s, v1.4s, v5.4s\n"
"fmla v28.4s, v2.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v5.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 40f\n"
- "ld1 { v6.d }[0], [x19], #0x8\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
- "ld1 { v6.s }[2], [x19], #0x4\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
"b 41f\n"
"40:" // Oddments: Load input (4, 3): Bit 1: Unset
- "ld1 { v6.s }[0], [x19], #0x4\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
"41:" // Oddments: Load input (4, 3): Bit 1: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0xe0]\n"
+ "ldr x20, [x15, #0xe0]\n"
"fmla v31.4s, v2.4s, v6.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v6.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 42f\n"
- "ld1 { v8.d }[0], [x19], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 43f\n"
- "ld1 { v8.s }[2], [x19], #0x4\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
"b 43f\n"
"42:" // Oddments: Load input (4, 4): Bit 1: Unset
- "ld1 { v8.s }[0], [x19], #0x4\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
"43:" // Oddments: Load input (4, 4): Bit 1: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0xe8]\n"
+ "ldr x20, [x15, #0xe8]\n"
"fmla v31.4s, v3.4s, v8.4s\n"
"fmla v28.4s, v4.4s, v12.4s\n"
"fmla v29.4s, v4.4s, v14.4s\n"
"fmla v30.4s, v4.4s, v8.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 44f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 45f\n"
- "ld1 { v10.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
"b 45f\n"
"44:" // Oddments: Load input (4, 5): Bit 1: Unset
- "ld1 { v10.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
"45:" // Oddments: Load input (4, 5): Bit 1: End
"ldr q0, [x16, #0x0]\n"
- "ldr x19, [x11, #0xf0]\n"
+ "ldr x20, [x15, #0xf0]\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"fmla v28.4s, v0.4s, v9.4s\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 46f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 47f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 47f\n"
"46:" // Oddments: Load input (5, 0): Bit 1: Unset
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"47:" // Oddments: Load input (5, 0): Bit 1: End
- "ldr x19, [x11, #0xf8]\n"
+ "ldr x20, [x15, #0xf8]\n"
"fmla v30.4s, v0.4s, v11.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #1, 48f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 49f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 49f\n"
"48:" // Oddments: Load input (5, 1): Bit 1: Unset
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"49:" // Oddments: Load input (5, 1): Bit 1: End
"ldr q1, [x16, #0x0]\n"
- "ldr x19, [x11, #0x100]\n"
+ "ldr x20, [x15, #0x100]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
"fmla v28.4s, v1.4s, v13.4s\n"
"fmla v29.4s, v1.4s, v5.4s\n"
"fmla v30.4s, v1.4s, v12.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 50f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 51f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 51f\n"
"50:" // Oddments: Load input (5, 2): Bit 1: Unset
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"51:" // Oddments: Load input (5, 2): Bit 1: End
"ldr q2, [x16, #0x0]\n"
- "ldr x19, [x11, #0x108]\n"
+ "ldr x20, [x15, #0x108]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v5.4s\n"
"fmla v29.4s, v2.4s, v6.4s\n"
"fmla v30.4s, v2.4s, v9.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 52f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 53f\n"
- "ld1 { v11.s }[2], [x19], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
"b 53f\n"
"52:" // Oddments: Load input (5, 3): Bit 1: Unset
- "ld1 { v11.s }[0], [x19], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"53:" // Oddments: Load input (5, 3): Bit 1: End
"ldr q3, [x16, #0x0]\n"
- "ldr x19, [x11, #0x110]\n"
+ "ldr x20, [x15, #0x110]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v6.4s\n"
"fmla v29.4s, v3.4s, v8.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 54f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 55f\n"
- "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
"b 55f\n"
"54:" // Oddments: Load input (5, 4): Bit 1: Unset
- "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
"55:" // Oddments: Load input (5, 4): Bit 1: End
"ldr q4, [x16, #0x0]\n"
- "ldr x19, [x11, #0x118]\n"
+ "ldr x20, [x15, #0x118]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
"fmla v28.4s, v4.4s, v8.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "add x19, x19, x10\n"
+ "add x20, x20, x10\n"
"tbz %x[n_channels], #1, 56f\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 57f\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
"b 57f\n"
"56:" // Oddments: Load input (5, 5): Bit 1: Unset
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
"57:" // Oddments: Load input (5, 5): Bit 1: End
"fmla v31.4s, v4.4s, v9.4s\n"
"fmax v28.4s, v28.4s, v18.4s\n"
@@ -1014,28 +1014,28 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmin v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v17.4s\n"
"tbz %x[n_channels], #1, 58f\n"
- "st1 { v28.d }[0], [x15], #0x8\n"
- "st1 { v29.d }[0], [x14], #0x8\n"
- "st1 { v30.d }[0], [x13], #0x8\n"
- "st1 { v31.d }[0], [x12], #0x8\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
"tbz %x[n_channels], #0, 59f\n"
- "st1 { v28.s }[2], [x15], #0x4\n"
- "st1 { v29.s }[2], [x14], #0x4\n"
- "st1 { v30.s }[2], [x13], #0x4\n"
- "st1 { v31.s }[2], [x12], #0x4\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
"b 59f\n"
"58:" // Oddments: Store: Bit 1: Unset
- "st1 { v28.s }[0], [x15], #0x4\n"
- "st1 { v29.s }[0], [x14], #0x4\n"
- "st1 { v30.s }[0], [x13], #0x4\n"
- "st1 { v31.s }[0], [x12], #0x4\n"
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
"59:" // Oddments: Store: Bit 1: End
"60:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index c0b87ada75..0ea3a8fbed 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,336 +43,329 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ld1r { v4.4s }, [%x[minmax_vals]]\n"
- "add x19, %x[minmax_vals], #0x4\n"
+ "ld1r { v2.4s }, [%x[minmax_vals]]\n"
+ "lsr x12, %x[n_channels], #0x2\n"
+ "add x20, %x[minmax_vals], #0x4\n"
+ "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "ld1r { v3.4s }, [x19]\n"
- "lsr x10, %x[n_channels], #0x2\n"
- "cbz x10, 5f\n"
+ "cbz x12, 5f\n"
"1:" // Channel loop
- "movi v25.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q25, [%x[bias], x11]\n"
+ "ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
- "mov v24.16b, v25.16b\n"
- "ldr q23, [%x[params], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "mov v22.16b, v25.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, %x[n_points], #0x1\n"
- "mov v21.16b, v25.16b\n"
- "ldr q2, [x9, x11]\n"
- "mov v20.16b, v25.16b\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "ldr q14, [x10, x11]\n"
+ "ldr q15, [x9, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldr q16, [x28, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr q17, [x27, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr q18, [x26, x11]\n"
+ "ldr q19, [x25, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr q20, [x24, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "mov v19.16b, v25.16b\n"
- "ldr q1, [x28, x11]\n"
- "mov v18.16b, v25.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v17.16b, v25.16b\n"
- "ldr q0, [x27, x11]\n"
- "mov v16.16b, v25.16b\n"
- "ldr q31, [x26, x11]\n"
- "ldp x25, x24, [x20], #0x10\n"
- "ldr q30, [x25, x11]\n"
- "ldr q29, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "ldr q28, [x23, x11]\n"
- "ldr q27, [x22, x11]\n"
- "ldr x21, [x20], #0x8\n"
- "ldr q26, [x21, x11]\n"
+ "ldr q21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "ldr q22, [x22, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "fmla v25.4s, v2.4s, v23.4s\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, x19, #0x1\n"
- "fmla v24.4s, v1.4s, v23.4s\n"
- "ldr q2, [x9, x11]\n"
- "fmla v22.4s, v0.4s, v23.4s\n"
- "fmla v21.4s, v31.4s, v23.4s\n"
- "ldr q1, [x28, x11]\n"
- "fmla v20.4s, v30.4s, v23.4s\n"
- "ldp x27, x26, [x20], #0x10\n"
- "fmla v19.4s, v29.4s, v23.4s\n"
- "fmla v18.4s, v28.4s, v23.4s\n"
- "ldr q0, [x27, x11]\n"
- "fmla v17.4s, v27.4s, v23.4s\n"
- "fmla v16.4s, v26.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x0]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "ldr q14, [x10, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "ldr q15, [x9, x11]\n"
+ "ldr q16, [x28, x11]\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "ldr q17, [x27, x11]\n"
+ "ldr q18, [x26, x11]\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q19, [x25, x11]\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "ldr q20, [x24, x11]\n"
"add %x[params], %x[params], #0x10\n"
- "ldr q31, [x26, x11]\n"
- "ldp x25, x24, [x20], #0x10\n"
- "ldr q30, [x25, x11]\n"
- "ldr q29, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "ldr q28, [x23, x11]\n"
- "ldr q27, [x22, x11]\n"
- "ldr x21, [x20], #0x8\n"
- "ldr q26, [x21, x11]\n"
+ "ldr q21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "ldr q22, [x22, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
- "fmla v25.4s, v2.4s, v23.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
- "fmla v24.4s, v1.4s, v23.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "fmla v22.4s, v0.4s, v23.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
- "fmla v21.4s, v31.4s, v23.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "fmla v20.4s, v30.4s, v23.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmla v19.4s, v29.4s, v23.4s\n"
- "fmla v18.4s, v28.4s, v23.4s\n"
- "fmla v17.4s, v27.4s, v23.4s\n"
- "fmla v16.4s, v26.4s, v23.4s\n"
- "fmax v25.4s, v25.4s, v4.4s\n"
- "fmax v24.4s, v24.4s, v4.4s\n"
- "fmax v22.4s, v22.4s, v4.4s\n"
- "fmin v25.4s, v25.4s, v3.4s\n"
- "str q25, [x27, x11]\n"
- "fmin v24.4s, v24.4s, v3.4s\n"
- "fmin v22.4s, v22.4s, v3.4s\n"
- "str q24, [x26, x11]\n"
- "fmax v21.4s, v21.4s, v4.4s\n"
- "fmax v20.4s, v20.4s, v4.4s\n"
- "str q22, [x25, x11]\n"
- "fmax v19.4s, v19.4s, v4.4s\n"
- "fmax v18.4s, v18.4s, v4.4s\n"
- "fmin v21.4s, v21.4s, v3.4s\n"
- "str q21, [x24, x11]\n"
- "fmin v20.4s, v20.4s, v3.4s\n"
- "fmin v19.4s, v19.4s, v3.4s\n"
- "str q20, [x23, x11]\n"
- "fmin v18.4s, v18.4s, v3.4s\n"
- "fmax v17.4s, v17.4s, v4.4s\n"
- "str q19, [x22, x11]\n"
- "fmax v16.4s, v16.4s, v4.4s\n"
- "str q18, [x21, x11]\n"
- "fmin v17.4s, v17.4s, v3.4s\n"
- "fmin v16.4s, v16.4s, v3.4s\n"
- "str q17, [x20, x11]\n"
- "str q16, [x19, x11]\n"
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v2.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v2.4s\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v2.4s\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v28.4s, v28.4s, v2.4s\n"
+ "fmax v29.4s, v29.4s, v2.4s\n"
+ "fmax v30.4s, v30.4s, v2.4s\n"
+ "fmax v31.4s, v31.4s, v2.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
+ "str q23, [x28, x11]\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
+ "str q24, [x27, x11]\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "str q25, [x26, x11]\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "str q26, [x25, x11]\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
+ "str q27, [x24, x11]\n"
+ "str q28, [x23, x11]\n"
+ "str q29, [x22, x11]\n"
+ "str q30, [x21, x11]\n"
+ "str q31, [x20, x11]\n"
"add x11, x11, #0x10\n"
- "cmp x11, x10, LSL #4\n"
+ "cmp x11, x12, LSL #4\n"
"blt 1b\n"
"5:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 17f\n"
- "movi v25.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
"cbz %x[bias], 8f\n"
- "add x19, %x[bias], x11\n"
+ "add x20, %x[bias], x11\n"
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v25.d }[0], [x19], #0x8\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
- "ld1 { v25.s }[2], [x19], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 1: Unset
- "tbz %x[n_channels], #0, 7f\n"
- "ld1 { v25.s }[0], [x19], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load bias: Bit 1: End
-
"8:" // Oddments: Load bias: Done
- "mov v24.16b, v25.16b\n"
- "ldr q23, [%x[params], #0x0]\n"
- "mov x20, %x[inptrs]\n"
- "mov v22.16b, v25.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "add %x[params], %x[params], #0x10\n"
- "mov v21.16b, v25.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v20.16b, v25.16b\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr x22, [x21], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v19.16b, v25.16b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "mov v18.16b, v25.16b\n"
+ "mov v31.16b, v23.16b\n"
"add x28, x28, x11\n"
- "mov v17.16b, v25.16b\n"
- "ldp x23, x22, [x20], #0x10\n"
- "mov v16.16b, v25.16b\n"
"add x27, x27, x11\n"
- "ldr x21, [x20], #0x8\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 9f\n"
- "ldr d2, [x9], #0x8\n"
- "ldr d1, [x28], #0x8\n"
- "ldr d0, [x27], #0x8\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d30, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d14, [x10], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
- "ld1 { v2.s }[2], [x9], #0x4\n"
- "ld1 { v1.s }[2], [x28], #0x4\n"
- "ld1 { v0.s }[2], [x27], #0x4\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v30.s }[2], [x25], #0x4\n"
- "ld1 { v29.s }[2], [x24], #0x4\n"
- "ld1 { v28.s }[2], [x23], #0x4\n"
- "ld1 { v27.s }[2], [x22], #0x4\n"
- "ld1 { v26.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x10], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
"b 10f\n"
"9:" // Oddments: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ldr s2, [x9], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s0, [x27], #0x4\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s30, [x25], #0x4\n"
- "ldr s29, [x24], #0x4\n"
- "ldr s28, [x23], #0x4\n"
- "ldr s27, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s14, [x10], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
"10:" // Oddments: Load: Bit 1: End
- "subs x19, %x[n_points], #0x1\n"
+ "subs x20, %x[n_points], #0x1\n"
"ble 14f\n"
"11:" // Oddments: Planar loop
- "fmla v25.4s, v2.4s, v23.4s\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "ldr x22, [x21], #0x8\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "add x10, x10, x11\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
"add x9, x9, x11\n"
- "fmla v24.4s, v1.4s, v23.4s\n"
- "ldp x27, x26, [x20], #0x10\n"
- "fmla v22.4s, v0.4s, v23.4s\n"
- "ldp x25, x24, [x20], #0x10\n"
- "fmla v21.4s, v31.4s, v23.4s\n"
"add x28, x28, x11\n"
- "fmla v20.4s, v30.4s, v23.4s\n"
- "ldp x23, x22, [x20], #0x10\n"
- "fmla v19.4s, v29.4s, v23.4s\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x0]\n"
"add x27, x27, x11\n"
- "fmla v18.4s, v28.4s, v23.4s\n"
- "ldr x21, [x20], #0x8\n"
- "fmla v17.4s, v27.4s, v23.4s\n"
"add x26, x26, x11\n"
- "fmla v16.4s, v26.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x0]\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 12f\n"
- "ldr d2, [x9], #0x8\n"
- "ldr d1, [x28], #0x8\n"
- "ldr d0, [x27], #0x8\n"
- "ldr d31, [x26], #0x8\n"
- "ldr d30, [x25], #0x8\n"
- "ldr d29, [x24], #0x8\n"
- "ldr d28, [x23], #0x8\n"
- "ldr d27, [x22], #0x8\n"
- "ldr d26, [x21], #0x8\n"
+ "ldr d14, [x10], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d19, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
- "ld1 { v2.s }[2], [x9], #0x4\n"
- "ld1 { v1.s }[2], [x28], #0x4\n"
- "ld1 { v0.s }[2], [x27], #0x4\n"
- "ld1 { v31.s }[2], [x26], #0x4\n"
- "ld1 { v30.s }[2], [x25], #0x4\n"
- "ld1 { v29.s }[2], [x24], #0x4\n"
- "ld1 { v28.s }[2], [x23], #0x4\n"
- "ld1 { v27.s }[2], [x22], #0x4\n"
- "ld1 { v26.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x10], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "ld1 { v19.s }[2], [x25], #0x4\n"
+ "ld1 { v20.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
"b 13f\n"
"12:" // Oddments: Planar loop: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 13f\n"
- "ldr s2, [x9], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s0, [x27], #0x4\n"
- "ldr s31, [x26], #0x4\n"
- "ldr s30, [x25], #0x4\n"
- "ldr s29, [x24], #0x4\n"
- "ldr s28, [x23], #0x4\n"
- "ldr s27, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
+ "ldr s14, [x10], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
"13:" // Oddments: Planar loop: Load: Bit 1: End
- "subs x19, x19, #0x1\n"
+ "subs x20, x20, #0x1\n"
"bgt 11b\n"
"14:" // Oddments: Planar tail
- "fmla v25.4s, v2.4s, v23.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "fmla v23.4s, v14.4s, v0.4s\n"
+ "fmla v24.4s, v15.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v2.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.4s, v16.4s, v0.4s\n"
+ "fmla v26.4s, v17.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v2.4s\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.4s, v18.4s, v0.4s\n"
+ "fmla v28.4s, v19.4s, v0.4s\n"
+ "fmax v25.4s, v25.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "fmla v30.4s, v21.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v2.4s\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.4s, v22.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "fmax v28.4s, v28.4s, v2.4s\n"
+ "fmax v29.4s, v29.4s, v2.4s\n"
"add x27, x27, x11\n"
- "fmla v24.4s, v1.4s, v23.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "fmla v22.4s, v0.4s, v23.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
"add x26, x26, x11\n"
- "fmla v21.4s, v31.4s, v23.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "fmla v20.4s, v30.4s, v23.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmax v30.4s, v30.4s, v2.4s\n"
+ "fmax v31.4s, v31.4s, v2.4s\n"
"add x25, x25, x11\n"
- "fmla v19.4s, v29.4s, v23.4s\n"
"add x24, x24, x11\n"
- "fmla v18.4s, v28.4s, v23.4s\n"
+ "fmin v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v1.4s\n"
"add x23, x23, x11\n"
- "fmla v17.4s, v27.4s, v23.4s\n"
"add x22, x22, x11\n"
- "fmla v16.4s, v26.4s, v23.4s\n"
+ "fmin v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v1.4s\n"
"add x21, x21, x11\n"
- "fmax v25.4s, v25.4s, v4.4s\n"
"add x20, x20, x11\n"
- "fmax v24.4s, v24.4s, v4.4s\n"
- "add x19, x19, x11\n"
- "fmax v22.4s, v22.4s, v4.4s\n"
- "fmin v25.4s, v25.4s, v3.4s\n"
- "fmin v24.4s, v24.4s, v3.4s\n"
- "fmin v22.4s, v22.4s, v3.4s\n"
- "fmax v21.4s, v21.4s, v4.4s\n"
- "fmax v20.4s, v20.4s, v4.4s\n"
- "fmax v19.4s, v19.4s, v4.4s\n"
- "fmin v21.4s, v21.4s, v3.4s\n"
- "fmin v20.4s, v20.4s, v3.4s\n"
- "fmin v19.4s, v19.4s, v3.4s\n"
- "fmax v18.4s, v18.4s, v4.4s\n"
- "fmax v17.4s, v17.4s, v4.4s\n"
- "fmax v16.4s, v16.4s, v4.4s\n"
- "fmin v18.4s, v18.4s, v3.4s\n"
- "fmin v17.4s, v17.4s, v3.4s\n"
- "fmin v16.4s, v16.4s, v3.4s\n"
+ "fmin v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v1.4s\n"
+ "fmin v31.4s, v31.4s, v1.4s\n"
"tbz %x[n_channels], #1, 15f\n"
- "st1 { v25.d }[0], [x27], #0x8\n"
- "st1 { v24.d }[0], [x26], #0x8\n"
- "st1 { v22.d }[0], [x25], #0x8\n"
- "st1 { v21.d }[0], [x24], #0x8\n"
- "st1 { v20.d }[0], [x23], #0x8\n"
- "st1 { v19.d }[0], [x22], #0x8\n"
- "st1 { v18.d }[0], [x21], #0x8\n"
- "st1 { v17.d }[0], [x20], #0x8\n"
- "st1 { v16.d }[0], [x19], #0x8\n"
+ "st1 { v23.d }[0], [x28], #0x8\n"
+ "st1 { v24.d }[0], [x27], #0x8\n"
+ "st1 { v25.d }[0], [x26], #0x8\n"
+ "st1 { v26.d }[0], [x25], #0x8\n"
+ "st1 { v27.d }[0], [x24], #0x8\n"
+ "st1 { v28.d }[0], [x23], #0x8\n"
+ "st1 { v29.d }[0], [x22], #0x8\n"
+ "st1 { v30.d }[0], [x21], #0x8\n"
+ "st1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
- "st1 { v25.s }[2], [x27], #0x4\n"
- "st1 { v24.s }[2], [x26], #0x4\n"
- "st1 { v22.s }[2], [x25], #0x4\n"
- "st1 { v21.s }[2], [x24], #0x4\n"
- "st1 { v20.s }[2], [x23], #0x4\n"
- "st1 { v19.s }[2], [x22], #0x4\n"
- "st1 { v18.s }[2], [x21], #0x4\n"
- "st1 { v17.s }[2], [x20], #0x4\n"
- "st1 { v16.s }[2], [x19], #0x4\n"
+ "st1 { v23.s }[2], [x28], #0x4\n"
+ "st1 { v24.s }[2], [x27], #0x4\n"
+ "st1 { v25.s }[2], [x26], #0x4\n"
+ "st1 { v26.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x24], #0x4\n"
+ "st1 { v28.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
"b 16f\n"
"15:" // Oddments: Store: Bit 1: Unset
- "tbz %x[n_channels], #0, 16f\n"
- "st1 { v25.s }[0], [x27], #0x4\n"
- "st1 { v24.s }[0], [x26], #0x4\n"
- "st1 { v22.s }[0], [x25], #0x4\n"
- "st1 { v21.s }[0], [x24], #0x4\n"
- "st1 { v20.s }[0], [x23], #0x4\n"
- "st1 { v19.s }[0], [x22], #0x4\n"
- "st1 { v18.s }[0], [x21], #0x4\n"
- "st1 { v17.s }[0], [x20], #0x4\n"
- "st1 { v16.s }[0], [x19], #0x4\n"
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v25.s }[0], [x26], #0x4\n"
+ "st1 { v26.s }[0], [x25], #0x4\n"
+ "st1 { v27.s }[0], [x24], #0x4\n"
+ "st1 { v28.s }[0], [x23], #0x4\n"
+ "st1 { v29.s }[0], [x22], #0x4\n"
+ "st1 { v30.s }[0], [x21], #0x4\n"
+ "st1 { v31.s }[0], [x20], #0x4\n"
"16:" // Oddments: Store: Bit 1: End
-
"17:" // End
-
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 04a7abd3bd..69b3865a65 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,59 +41,59 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ldp x14, x13, [%x[outptrs], #0x0]\n"
- "add x12, %x[clamps], #0x4\n"
- "ldp x11, x10, [%x[outptrs], #0x10]\n"
- "mov x9, #0x0\n"
- "ldp x28, x27, [%x[outptrs], #0x20]\n"
- "mov x26, #0x0\n"
- "ldp x25, x24, [%x[outptrs], #0x30]\n"
- "lsr x23, %x[channel_multiplier], #0x2\n"
- "ldr x22, [%x[outptrs], #0x40]\n"
+ "ld1r { v24.4s }, [%x[clamps]]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
"ldr q0, [x21, #0x0]\n"
"ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
"ldr q2, [x20, #0x0]\n"
"ldr q3, [x20, #0x10]\n"
- "ldr q4, [x19, #0x0]\n"
- "ldr q5, [x19, #0x10]\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "ldr x19, [%x[inptrs], #0x28]\n"
- "ldr q6, [x21, #0x0]\n"
- "ldr q7, [x21, #0x10]\n"
"ldr q8, [x20, #0x0]\n"
"ldr q9, [x20, #0x10]\n"
- "ldr q10, [x19, #0x0]\n"
- "ldr q11, [x19, #0x10]\n"
- "ldr x19, [%x[inptrs], #0x30]\n"
- "ld1r { v24.4s }, [%x[clamps]]\n"
- "ld1r { v23.4s }, [x12]\n"
- "ldr q12, [x19, #0x0]\n"
- "ldr q13, [x19, #0x10]\n"
- "cbz x23, 3f\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "ldr q12, [x20, #0x0]\n"
+ "ldr q13, [x20, #0x10]\n"
+ "ldp x13, x12, [%x[outptrs], #0x0]\n"
+ "ldp x11, x10, [%x[outptrs], #0x10]\n"
+ "ldp x9, x28, [%x[outptrs], #0x20]\n"
+ "ldp x27, x26, [%x[outptrs], #0x30]\n"
+ "ldr x25, [%x[outptrs], #0x40]\n"
+ "cbz x22, 3f\n"
"ldr q14, [%x[params], #0x0]\n"
- "mov v15.16b, v14.16b\n"
"ldr q31, [%x[params], #0x10]\n"
- "subs x23, x23, #0x1\n"
- "mov v16.16b, v14.16b\n"
+ "subs x22, x22, #0x1\n"
+ "mov v15.16b, v14.16b\n"
"ldr q30, [%x[params], #0x20]\n"
- "mov v17.16b, v14.16b\n"
"ldr q29, [%x[params], #0x30]\n"
- "add %x[params], %x[params], #0x40\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x40\n"
"mov v20.16b, v14.16b\n"
"mov v21.16b, v14.16b\n"
"mov v22.16b, v14.16b\n"
"beq 2f\n"
"1:" // Output channel complete vector loop
"fmla v14.4s, v31.4s, v0.s[0]\n"
- "add x9, x9, #0x4\n"
"fmla v15.4s, v31.4s, v0.s[2]\n"
- "subs x23, x23, #0x1\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
"fmla v16.4s, v31.4s, v1.s[0]\n"
"fmla v17.4s, v31.4s, v4.s[0]\n"
"fmla v18.4s, v31.4s, v4.s[2]\n"
@@ -174,51 +174,51 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ldr q30, [%x[params], #0x80]\n"
"fmla v14.4s, v29.4s, v4.s[2]\n"
"fmla v15.4s, v29.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v23.4s\n"
"fmla v16.4s, v29.4s, v5.s[2]\n"
"fmla v17.4s, v29.4s, v8.s[2]\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "str q14, [x13, x14]\n"
+ "ldr q14, [%x[params], #0x60]\n"
"fmla v18.4s, v29.4s, v9.s[0]\n"
"fmla v19.4s, v29.4s, v9.s[2]\n"
+ "fmin v15.4s, v15.4s, v23.4s\n"
"fmla v20.4s, v29.4s, v12.s[2]\n"
"fmla v21.4s, v29.4s, v13.s[0]\n"
+ "fmin v16.4s, v16.4s, v23.4s\n"
"fmla v22.4s, v29.4s, v13.s[2]\n"
"ldr q29, [%x[params], #0x90]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
- "str q14, [x14, x26]\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "ldr q14, [%x[params], #0x60]\n"
- "add %x[params], %x[params], #0xa0\n"
"fmin v17.4s, v17.4s, v23.4s\n"
- "str q15, [x13, x26]\n"
+ "add %x[params], %x[params], #0xa0\n"
"fmin v18.4s, v18.4s, v23.4s\n"
"fmin v19.4s, v19.4s, v23.4s\n"
- "str q16, [x11, x26]\n"
"fmin v20.4s, v20.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v23.4s\n"
+ "fmin v22.4s, v22.4s, v23.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
"fmax v17.4s, v17.4s, v24.4s\n"
- "str q17, [x10, x26]\n"
+ "str q16, [x11, x14]\n"
"fmax v18.4s, v18.4s, v24.4s\n"
"fmax v19.4s, v19.4s, v24.4s\n"
- "str q18, [x28, x26]\n"
+ "str q17, [x10, x14]\n"
"fmax v20.4s, v20.4s, v24.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "str q19, [x27, x26]\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "str q20, [x25, x26]\n"
"fmax v21.4s, v21.4s, v24.4s\n"
- "mov v15.16b, v14.16b\n"
- "str q21, [x24, x26]\n"
+ "str q18, [x9, x14]\n"
"fmax v22.4s, v22.4s, v24.4s\n"
+ "str q19, [x28, x14]\n"
+ "mov v15.16b, v14.16b\n"
+ "str q20, [x27, x14]\n"
"mov v16.16b, v14.16b\n"
- "str q22, [x22, x26]\n"
"mov v17.16b, v14.16b\n"
- "add x26, x26, #0x10\n"
+ "str q21, [x26, x14]\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
+ "str q22, [x25, x14]\n"
"mov v20.16b, v14.16b\n"
"mov v21.16b, v14.16b\n"
+ "add x14, x14, #0x10\n"
"mov v22.16b, v14.16b\n"
"bgt 1b\n"
"2:" // Output channel complete vector tail
@@ -303,58 +303,58 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v22.4s, v30.4s, v13.s[1]\n"
"fmla v14.4s, v29.4s, v4.s[2]\n"
"fmla v15.4s, v29.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v23.4s\n"
"fmla v16.4s, v29.4s, v5.s[2]\n"
"fmla v17.4s, v29.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v23.4s\n"
"fmla v18.4s, v29.4s, v9.s[0]\n"
"fmla v19.4s, v29.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v23.4s\n"
"fmla v20.4s, v29.4s, v12.s[2]\n"
"fmla v21.4s, v29.4s, v13.s[0]\n"
- "fmla v22.4s, v29.4s, v13.s[2]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
- "fmax v14.4s, v14.4s, v24.4s\n"
- "str q14, [x14, x26]\n"
- "fmax v15.4s, v15.4s, v24.4s\n"
- "fmax v16.4s, v16.4s, v24.4s\n"
- "str q15, [x13, x26]\n"
"fmin v17.4s, v17.4s, v23.4s\n"
+ "fmla v22.4s, v29.4s, v13.s[2]\n"
"fmin v18.4s, v18.4s, v23.4s\n"
- "str q16, [x11, x26]\n"
"fmin v19.4s, v19.4s, v23.4s\n"
"fmin v20.4s, v20.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v23.4s\n"
+ "fmin v22.4s, v22.4s, v23.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "str q14, [x13, x14]\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
"fmax v17.4s, v17.4s, v24.4s\n"
- "str q17, [x10, x26]\n"
+ "str q15, [x12, x14]\n"
"fmax v18.4s, v18.4s, v24.4s\n"
"fmax v19.4s, v19.4s, v24.4s\n"
- "str q18, [x28, x26]\n"
+ "str q16, [x11, x14]\n"
"fmax v20.4s, v20.4s, v24.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "str q19, [x27, x26]\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
- "str q20, [x25, x26]\n"
"fmax v21.4s, v21.4s, v24.4s\n"
+ "str q17, [x10, x14]\n"
"fmax v22.4s, v22.4s, v24.4s\n"
- "str q21, [x24, x26]\n"
- "str q22, [x22, x26]\n"
- "add x26, x26, #0x10\n"
+ "str q18, [x9, x14]\n"
+ "str q19, [x28, x14]\n"
+ "str q20, [x27, x14]\n"
+ "str q21, [x26, x14]\n"
+ "str q22, [x25, x14]\n"
+ "add x14, x14, #0x10\n"
"3:" // Output channel oddments
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q14, [%x[params], #0x0]\n"
- "mov v15.16b, v14.16b\n"
"ldr q31, [%x[params], #0x10]\n"
+ "mov v15.16b, v14.16b\n"
"mov v16.16b, v14.16b\n"
"ldr q30, [%x[params], #0x20]\n"
- "mov v17.16b, v14.16b\n"
"ldr q29, [%x[params], #0x30]\n"
+ "mov v17.16b, v14.16b\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
"mov v20.16b, v14.16b\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
"mov v21.16b, v14.16b\n"
"mov v22.16b, v14.16b\n"
"fmla v14.4s, v31.4s, v0.s[0]\n"
- "fmla v15.4s, v31.4s, v0.s[2]\n"
"fmla v16.4s, v31.4s, v1.s[0]\n"
"fmla v17.4s, v31.4s, v4.s[0]\n"
"fmla v18.4s, v31.4s, v4.s[2]\n"
@@ -434,98 +434,97 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v22.4s, v30.4s, v13.s[1]\n"
"fmla v14.4s, v29.4s, v4.s[2]\n"
"fmla v15.4s, v29.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v23.4s\n"
"fmla v16.4s, v29.4s, v5.s[2]\n"
"fmla v17.4s, v29.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v23.4s\n"
"fmla v18.4s, v29.4s, v9.s[0]\n"
"fmla v19.4s, v29.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v23.4s\n"
"fmla v20.4s, v29.4s, v12.s[2]\n"
"fmla v21.4s, v29.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v23.4s\n"
"fmla v22.4s, v29.4s, v13.s[2]\n"
- "fmin v14.4s, v14.4s, v23.4s\n"
- "fmin v15.4s, v15.4s, v23.4s\n"
- "fmin v16.4s, v16.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v23.4s\n"
+ "fmin v19.4s, v19.4s, v23.4s\n"
+ "fmin v20.4s, v20.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v23.4s\n"
+ "fmin v22.4s, v22.4s, v23.4s\n"
"fmax v14.4s, v14.4s, v24.4s\n"
"fmax v15.4s, v15.4s, v24.4s\n"
"fmax v16.4s, v16.4s, v24.4s\n"
- "fmin v17.4s, v17.4s, v23.4s\n"
- "fmin v18.4s, v18.4s, v23.4s\n"
- "fmin v19.4s, v19.4s, v23.4s\n"
"fmax v17.4s, v17.4s, v24.4s\n"
"fmax v18.4s, v18.4s, v24.4s\n"
"fmax v19.4s, v19.4s, v24.4s\n"
- "fmin v20.4s, v20.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v23.4s\n"
- "fmin v22.4s, v22.4s, v23.4s\n"
"fmax v20.4s, v20.4s, v24.4s\n"
"fmax v21.4s, v21.4s, v24.4s\n"
"fmax v22.4s, v22.4s, v24.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
- "add x19, x14, x26\n"
- "st1 { v14.d }[0], [x19]\n"
- "add x19, x13, x26\n"
- "st1 { v15.d }[0], [x19]\n"
- "add x19, x11, x26\n"
- "st1 { v16.d }[0], [x19]\n"
- "add x19, x10, x26\n"
- "st1 { v17.d }[0], [x19]\n"
- "add x19, x28, x26\n"
- "st1 { v18.d }[0], [x19]\n"
- "add x19, x27, x26\n"
- "st1 { v19.d }[0], [x19]\n"
- "add x19, x25, x26\n"
- "st1 { v20.d }[0], [x19]\n"
- "add x19, x24, x26\n"
- "st1 { v21.d }[0], [x19]\n"
- "add x19, x22, x26\n"
- "st1 { v22.d }[0], [x19]\n"
- "add x26, x26, #0x8\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.d }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.d }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.d }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.d }[0], [x24]\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v20.d }[0], [x22]\n"
+ "st1 { v21.d }[0], [x21]\n"
+ "st1 { v22.d }[0], [x20]\n"
"tbz %x[channel_multiplier], #0, 5f\n"
- "add x19, x14, x26\n"
- "st1 { v14.s }[2], [x19]\n"
- "add x19, x13, x26\n"
- "st1 { v15.s }[2], [x19]\n"
- "add x19, x11, x26\n"
- "st1 { v16.s }[2], [x19]\n"
- "add x19, x10, x26\n"
- "st1 { v17.s }[2], [x19]\n"
- "add x19, x28, x26\n"
- "st1 { v18.s }[2], [x19]\n"
- "add x19, x27, x26\n"
- "st1 { v19.s }[2], [x19]\n"
- "add x19, x25, x26\n"
- "st1 { v20.s }[2], [x19]\n"
- "add x19, x24, x26\n"
- "st1 { v21.s }[2], [x19]\n"
- "add x19, x22, x26\n"
- "st1 { v22.s }[2], [x19]\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[2], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v22.s }[2], [x20]\n"
"b 5f\n"
"4:" // Output channel oddments: Store: Bit 1: Unset
- "tbz %x[channel_multiplier], #0, 5f\n"
- "add x19, x14, x26\n"
- "st1 { v14.s }[0], [x19]\n"
- "add x19, x13, x26\n"
- "st1 { v15.s }[0], [x19]\n"
- "add x19, x11, x26\n"
- "st1 { v16.s }[0], [x19]\n"
- "add x19, x10, x26\n"
- "st1 { v17.s }[0], [x19]\n"
- "add x19, x28, x26\n"
- "st1 { v18.s }[0], [x19]\n"
- "add x19, x27, x26\n"
- "st1 { v19.s }[0], [x19]\n"
- "add x19, x25, x26\n"
- "st1 { v20.s }[0], [x19]\n"
- "add x19, x24, x26\n"
- "st1 { v21.s }[0], [x19]\n"
- "add x19, x22, x26\n"
- "st1 { v22.s }[0], [x19]\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[0], [x24]\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v20.s }[0], [x22]\n"
+ "st1 { v21.s }[0], [x21]\n"
+ "st1 { v22.s }[0], [x20]\n"
"5:" // Output channel oddments: Store: Bit 1: End
"6:" // End
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index 67fc09b2ee..50848cc2e8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,56 +42,56 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ldp x13, x12, [%x[outptrs], #0x0]\n"
- "add x11, %x[clamps], #0x4\n"
- "ldp x10, x9, [%x[outptrs], #0x10]\n"
- "mov x28, #0x0\n"
- "ldp x27, x26, [%x[outptrs], #0x20]\n"
- "mov x25, #0x0\n"
- "ldp x24, x23, [%x[outptrs], #0x30]\n"
- "lsr x22, %x[channel_multiplier], #0x2\n"
+ "ld1r { v21.4s }, [%x[clamps]]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
"ldr q0, [x21, #0x0]\n"
"ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x13, #0x0\n"
+ "ld1r { v20.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
"ldr q2, [x20, #0x0]\n"
"ldr q3, [x20, #0x10]\n"
- "ldr q4, [x19, #0x0]\n"
- "ldr q5, [x19, #0x10]\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "ldr x19, [%x[inptrs], #0x28]\n"
- "ldr q6, [x21, #0x0]\n"
- "ldr q7, [x21, #0x10]\n"
"ldr q8, [x20, #0x0]\n"
"ldr q9, [x20, #0x10]\n"
- "ldr q10, [x19, #0x0]\n"
- "ldr q11, [x19, #0x10]\n"
- "ld1r { v21.4s }, [%x[clamps]]\n"
- "ld1r { v20.4s }, [x11]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "ldp x28, x27, [%x[outptrs], #0x20]\n"
+ "ldp x26, x25, [%x[outptrs], #0x30]\n"
"cbz x22, 3f\n"
"ldr q12, [%x[params], #0x0]\n"
- "mov v13.16b, v12.16b\n"
"ldr q31, [%x[params], #0x10]\n"
"subs x22, x22, #0x1\n"
- "mov v14.16b, v12.16b\n"
+ "mov v13.16b, v12.16b\n"
"ldr q30, [%x[params], #0x20]\n"
- "mov v15.16b, v12.16b\n"
"ldr q29, [%x[params], #0x30]\n"
- "mov v16.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v15.16b, v12.16b\n"
"ldr q28, [%x[params], #0x40]\n"
- "mov v17.16b, v12.16b\n"
"ldr q27, [%x[params], #0x50]\n"
- "add %x[params], %x[params], #0x60\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v12.16b\n"
+ "add %x[params], %x[params], #0x60\n"
"beq 2f\n"
"1:" // Output channel complete vector loop
"fmla v12.4s, v31.4s, v0.s[0]\n"
- "add x28, x28, #0x4\n"
"fmla v13.4s, v31.4s, v0.s[1]\n"
"subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
"fmla v14.4s, v31.4s, v0.s[2]\n"
"fmla v15.4s, v31.4s, v0.s[3]\n"
"fmla v16.4s, v31.4s, v2.s[0]\n"
@@ -308,46 +308,46 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ldr q28, [%x[params], #0x180]\n"
"fmla v12.4s, v27.4s, v9.s[0]\n"
"fmla v13.4s, v27.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
"fmla v14.4s, v27.4s, v9.s[2]\n"
"fmla v15.4s, v27.4s, v9.s[3]\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "str q12, [x12, x13]\n"
+ "ldr q12, [%x[params], #0x140]\n"
"fmla v16.4s, v27.4s, v11.s[0]\n"
"fmla v17.4s, v27.4s, v11.s[1]\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
"fmla v18.4s, v27.4s, v11.s[2]\n"
"fmla v19.4s, v27.4s, v11.s[3]\n"
"ldr q27, [%x[params], #0x190]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
"fmin v14.4s, v14.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "str q12, [x13, x25]\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "ldr q12, [%x[params], #0x140]\n"
- "add %x[params], %x[params], #0x1a0\n"
"fmin v15.4s, v15.4s, v20.4s\n"
- "str q13, [x12, x25]\n"
"fmin v16.4s, v16.4s, v20.4s\n"
+ "add %x[params], %x[params], #0x1a0\n"
"fmin v17.4s, v17.4s, v20.4s\n"
- "str q14, [x10, x25]\n"
"fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
"fmax v15.4s, v15.4s, v21.4s\n"
- "str q15, [x9, x25]\n"
+ "str q14, [x10, x13]\n"
"fmax v16.4s, v16.4s, v21.4s\n"
"fmax v17.4s, v17.4s, v21.4s\n"
- "str q16, [x27, x25]\n"
+ "str q15, [x9, x13]\n"
"fmax v18.4s, v18.4s, v21.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "str q17, [x26, x25]\n"
- "mov v13.16b, v12.16b\n"
- "str q18, [x24, x25]\n"
"fmax v19.4s, v19.4s, v21.4s\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "mov v13.16b, v12.16b\n"
"mov v14.16b, v12.16b\n"
- "str q19, [x23, x25]\n"
+ "str q18, [x26, x13]\n"
"mov v15.16b, v12.16b\n"
- "add x25, x25, #0x10\n"
"mov v16.16b, v12.16b\n"
+ "str q19, [x25, x13]\n"
"mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
+ "add x13, x13, #0x10\n"
"mov v19.16b, v12.16b\n"
"bgt 1b\n"
"2:" // Output channel complete vector tail
@@ -566,51 +566,51 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v19.4s, v28.4s, v11.s[2]\n"
"fmla v12.4s, v27.4s, v9.s[0]\n"
"fmla v13.4s, v27.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
"fmla v14.4s, v27.4s, v9.s[2]\n"
"fmla v15.4s, v27.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
"fmla v16.4s, v27.4s, v11.s[0]\n"
"fmla v17.4s, v27.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
"fmla v18.4s, v27.4s, v11.s[2]\n"
"fmla v19.4s, v27.4s, v11.s[3]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "str q12, [x13, x25]\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
- "str q13, [x12, x25]\n"
"fmin v15.4s, v15.4s, v20.4s\n"
"fmin v16.4s, v16.4s, v20.4s\n"
- "str q14, [x10, x25]\n"
"fmin v17.4s, v17.4s, v20.4s\n"
"fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "str q12, [x12, x13]\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
"fmax v15.4s, v15.4s, v21.4s\n"
- "str q15, [x9, x25]\n"
+ "str q13, [x11, x13]\n"
"fmax v16.4s, v16.4s, v21.4s\n"
"fmax v17.4s, v17.4s, v21.4s\n"
- "str q16, [x27, x25]\n"
+ "str q14, [x10, x13]\n"
"fmax v18.4s, v18.4s, v21.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
- "str q17, [x26, x25]\n"
"fmax v19.4s, v19.4s, v21.4s\n"
- "str q18, [x24, x25]\n"
- "str q19, [x23, x25]\n"
- "add x25, x25, #0x10\n"
+ "str q15, [x9, x13]\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "str q18, [x26, x13]\n"
+ "str q19, [x25, x13]\n"
+ "add x13, x13, #0x10\n"
"3:" // Output channel oddments
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q12, [%x[params], #0x0]\n"
- "mov v13.16b, v12.16b\n"
"ldr q31, [%x[params], #0x10]\n"
+ "mov v13.16b, v12.16b\n"
"mov v14.16b, v12.16b\n"
"ldr q30, [%x[params], #0x20]\n"
- "mov v15.16b, v12.16b\n"
"ldr q29, [%x[params], #0x30]\n"
+ "mov v15.16b, v12.16b\n"
"mov v16.16b, v12.16b\n"
"ldr q28, [%x[params], #0x40]\n"
- "mov v17.16b, v12.16b\n"
"ldr q27, [%x[params], #0x50]\n"
+ "mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v12.16b\n"
"fmla v12.4s, v31.4s, v0.s[0]\n"
@@ -828,89 +828,88 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v19.4s, v28.4s, v11.s[2]\n"
"fmla v12.4s, v27.4s, v9.s[0]\n"
"fmla v13.4s, v27.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
"fmla v14.4s, v27.4s, v9.s[2]\n"
"fmla v15.4s, v27.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
"fmla v16.4s, v27.4s, v11.s[0]\n"
"fmla v17.4s, v27.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
"fmla v18.4s, v27.4s, v11.s[2]\n"
"fmla v19.4s, v27.4s, v11.s[3]\n"
- "fmin v12.4s, v12.4s, v20.4s\n"
- "fmin v13.4s, v13.4s, v20.4s\n"
- "fmin v14.4s, v14.4s, v20.4s\n"
- "fmax v12.4s, v12.4s, v21.4s\n"
- "fmax v13.4s, v13.4s, v21.4s\n"
- "fmax v14.4s, v14.4s, v21.4s\n"
"fmin v15.4s, v15.4s, v20.4s\n"
"fmin v16.4s, v16.4s, v20.4s\n"
"fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
"fmax v15.4s, v15.4s, v21.4s\n"
"fmax v16.4s, v16.4s, v21.4s\n"
"fmax v17.4s, v17.4s, v21.4s\n"
- "fmin v18.4s, v18.4s, v20.4s\n"
- "fmin v19.4s, v19.4s, v20.4s\n"
"fmax v18.4s, v18.4s, v21.4s\n"
"fmax v19.4s, v19.4s, v21.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
- "add x19, x13, x25\n"
- "st1 { v12.d }[0], [x19]\n"
- "add x19, x12, x25\n"
- "st1 { v13.d }[0], [x19]\n"
- "add x19, x10, x25\n"
- "st1 { v14.d }[0], [x19]\n"
- "add x19, x9, x25\n"
- "st1 { v15.d }[0], [x19]\n"
- "add x19, x27, x25\n"
- "st1 { v16.d }[0], [x19]\n"
- "add x19, x26, x25\n"
- "st1 { v17.d }[0], [x19]\n"
- "add x19, x24, x25\n"
- "st1 { v18.d }[0], [x19]\n"
- "add x19, x23, x25\n"
- "st1 { v19.d }[0], [x19]\n"
- "add x25, x25, #0x8\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.d }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.d }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.d }[0], [x24]\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "st1 { v19.d }[0], [x20]\n"
"tbz %x[channel_multiplier], #0, 5f\n"
- "add x19, x13, x25\n"
- "st1 { v12.s }[2], [x19]\n"
- "add x19, x12, x25\n"
- "st1 { v13.s }[2], [x19]\n"
- "add x19, x10, x25\n"
- "st1 { v14.s }[2], [x19]\n"
- "add x19, x9, x25\n"
- "st1 { v15.s }[2], [x19]\n"
- "add x19, x27, x25\n"
- "st1 { v16.s }[2], [x19]\n"
- "add x19, x26, x25\n"
- "st1 { v17.s }[2], [x19]\n"
- "add x19, x24, x25\n"
- "st1 { v18.s }[2], [x19]\n"
- "add x19, x23, x25\n"
- "st1 { v19.s }[2], [x19]\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[2], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[2], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v19.s }[2], [x20]\n"
"b 5f\n"
"4:" // Output channel oddments: Store: Bit 1: Unset
- "tbz %x[channel_multiplier], #0, 5f\n"
- "add x19, x13, x25\n"
- "st1 { v12.s }[0], [x19]\n"
- "add x19, x12, x25\n"
- "st1 { v13.s }[0], [x19]\n"
- "add x19, x10, x25\n"
- "st1 { v14.s }[0], [x19]\n"
- "add x19, x9, x25\n"
- "st1 { v15.s }[0], [x19]\n"
- "add x19, x27, x25\n"
- "st1 { v16.s }[0], [x19]\n"
- "add x19, x26, x25\n"
- "st1 { v17.s }[0], [x19]\n"
- "add x19, x24, x25\n"
- "st1 { v18.s }[0], [x19]\n"
- "add x19, x23, x25\n"
- "st1 { v19.s }[0], [x19]\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[0], [x24]\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "st1 { v19.s }[0], [x20]\n"
"5:" // Output channel oddments: Store: Bit 1: End
"6:" // End
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 46210e2964..c28f29c4f9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,807 +44,804 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
__asm__ __volatile__(
"ld1r { v11.4s }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x2\n"
+ "add x20, %x[minmax_vals], #0x4\n"
+ "ld1r { v10.4s }, [x20]\n"
"mov x10, #0x0\n"
- "add x19, %x[minmax_vals], #0x4\n"
- "ld1r { v10.4s }, [x19]\n"
- "lsr x9, %x[n_output_channels], #0x2\n"
- "cbz x9, 8f\n"
+ "cbz x11, 8f\n"
"1:" // Output channel loop
- "movi v16.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x10, #0x2\n"
- "ldr q16, [%x[bias], x19]\n"
+ "lsl x20, x10, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "mov v9.16b, v16.16b\n"
- "ldr q8, [%x[weights], #0x0]\n"
- "mov x19, %x[inptrs]\n"
- "mov v7.16b, v16.16b\n"
- "ldp x24, x28, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "mov v6.16b, v16.16b\n"
- "ldr q5, [x24, #0x0]\n"
- "mov v4.16b, v16.16b\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr q8, [x23, #0x0]\n"
+ "ldr q7, [x23, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q5, [x9, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
"add %x[weights], %x[weights], #0x10\n"
- "mov v3.16b, v16.16b\n"
- "ldr q2, [x24, #0x10]\n"
- "mov v1.16b, v16.16b\n"
- "ldr q0, [x28, #0x0]\n"
- "mov v31.16b, v16.16b\n"
- "ldr q30, [x28, #0x10]\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "cbz x20, 6f\n"
- "ldp x24, x28, [x19], #0x10\n"
- "ldr q20, [%x[weights], #0x0]\n"
- "subs x20, x20, #0x1\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x21, 6f\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q19, [x24, #0x0]\n"
- "ldr q18, [x24, #0x10]\n"
- "ldr q17, [x28, #0x0]\n"
- "ldr q16, [x28, #0x10]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q2, [x23, #0x10]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "ldp x24, x28, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "ldr q5, [x24, #0x0]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "ldr q2, [x24, #0x10]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "ldr q0, [x28, #0x0]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "ldr q30, [x28, #0x10]\n"
- "fmla v9.4s, v20.4s, v19.s[0]\n"
- "ldr q8, [%x[weights], #0x0]\n"
- "fmla v7.4s, v20.4s, v19.s[1]\n"
- "ldp x24, x28, [x19], #0x10\n"
- "fmla v6.4s, v20.4s, v19.s[2]\n"
- "fmla v4.4s, v20.4s, v19.s[3]\n"
- "ldr q19, [x24, #0x0]\n"
- "fmla v3.4s, v20.4s, v18.s[0]\n"
- "fmla v1.4s, v20.4s, v18.s[1]\n"
- "fmla v31.4s, v20.4s, v18.s[2]\n"
- "fmla v29.4s, v20.4s, v18.s[3]\n"
- "ldr q18, [x24, #0x10]\n"
- "fmla v28.4s, v20.4s, v17.s[0]\n"
- "fmla v27.4s, v20.4s, v17.s[1]\n"
- "fmla v26.4s, v20.4s, v17.s[2]\n"
- "fmla v25.4s, v20.4s, v17.s[3]\n"
- "ldr q17, [x28, #0x0]\n"
- "fmla v24.4s, v20.4s, v16.s[0]\n"
- "fmla v23.4s, v20.4s, v16.s[1]\n"
- "fmla v22.4s, v20.4s, v16.s[2]\n"
- "fmla v21.4s, v20.4s, v16.s[3]\n"
- "ldr q16, [x28, #0x10]\n"
- "ldr q20, [%x[weights], #0x10]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x23, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x23, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x9, #0x10]\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "fmla v16.4s, v4.4s, v3.s[0]\n"
+ "fmla v17.4s, v4.4s, v3.s[1]\n"
+ "fmla v18.4s, v4.4s, v3.s[2]\n"
+ "fmla v19.4s, v4.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v21.4s, v4.4s, v2.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[2]\n"
+ "fmla v23.4s, v4.4s, v2.s[3]\n"
+ "ldr q2, [x23, #0x10]\n"
+ "fmla v24.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v27.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "fmla v28.4s, v4.4s, v0.s[0]\n"
+ "fmla v29.4s, v4.4s, v0.s[1]\n"
+ "fmla v30.4s, v4.4s, v0.s[2]\n"
+ "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [x9, #0x10]\n"
+ "ldr q4, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "lsl x27, x10, #0x2\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "fmla v9.4s, v20.4s, v19.s[0]\n"
- "fmla v7.4s, v20.4s, v19.s[1]\n"
- "fmla v6.4s, v20.4s, v19.s[2]\n"
- "fmla v4.4s, v20.4s, v19.s[3]\n"
- "fmla v3.4s, v20.4s, v18.s[0]\n"
- "fmla v1.4s, v20.4s, v18.s[1]\n"
- "fmla v31.4s, v20.4s, v18.s[2]\n"
- "fmla v29.4s, v20.4s, v18.s[3]\n"
- "fmla v28.4s, v20.4s, v17.s[0]\n"
- "fmla v27.4s, v20.4s, v17.s[1]\n"
- "fmla v26.4s, v20.4s, v17.s[2]\n"
- "fmla v25.4s, v20.4s, v17.s[3]\n"
- "fmla v24.4s, v20.4s, v16.s[0]\n"
- "fmla v23.4s, v20.4s, v16.s[1]\n"
- "fmla v22.4s, v20.4s, v16.s[2]\n"
- "fmla v21.4s, v20.4s, v16.s[3]\n"
- "fmin v9.4s, v9.4s, v10.4s\n"
- "fmin v7.4s, v7.4s, v10.4s\n"
- "fmin v6.4s, v6.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v11.4s\n"
- "str q9, [x19, x27]\n"
- "fmax v7.4s, v7.4s, v11.4s\n"
- "fmax v6.4s, v6.4s, v11.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmin v4.4s, v4.4s, v10.4s\n"
- "str q7, [x20, x27]\n"
- "fmin v3.4s, v3.4s, v10.4s\n"
- "fmin v1.4s, v1.4s, v10.4s\n"
- "str q6, [x21, x27]\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "str q4, [x22, x27]\n"
- "fmax v1.4s, v1.4s, v11.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q3, [x23, x27]\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q1, [x24, x27]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "str q31, [x25, x27]\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q29, [x26, x27]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "str q28, [x19, x27]\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q27, [x20, x27]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "str q26, [x21, x27]\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "str q25, [x22, x27]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "str q24, [x23, x27]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "lsl x28, x10, #0x2\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v4.4s, v3.s[0]\n"
+ "fmla v17.4s, v4.4s, v3.s[1]\n"
+ "fmin v16.4s, v16.4s, v10.4s\n"
+ "fmla v18.4s, v4.4s, v3.s[2]\n"
+ "fmla v19.4s, v4.4s, v3.s[3]\n"
+ "fmin v17.4s, v17.4s, v10.4s\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v21.4s, v4.4s, v2.s[1]\n"
+ "fmin v18.4s, v18.4s, v10.4s\n"
+ "fmla v22.4s, v4.4s, v2.s[2]\n"
+ "fmla v23.4s, v4.4s, v2.s[3]\n"
+ "fmin v19.4s, v19.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmin v20.4s, v20.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v27.4s, v4.4s, v1.s[3]\n"
"fmin v21.4s, v21.4s, v10.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q23, [x24, x27]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v0.s[0]\n"
+ "fmla v29.4s, v4.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v0.s[2]\n"
+ "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v11.4s\n"
+ "str q16, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v11.4s\n"
+ "fmax v19.4s, v19.4s, v11.4s\n"
+ "str q17, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v11.4s\n"
"fmax v21.4s, v21.4s, v11.4s\n"
- "str q22, [x25, x27]\n"
- "str q21, [x26, x27]\n"
+ "str q18, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "str q19, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "str q20, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "str q21, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "str q22, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v10.4s\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "str q23, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "str q24, [x20, x28]\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "str q25, [x21, x28]\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "str q26, [x22, x28]\n"
+ "fmax v30.4s, v30.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "str q27, [x23, x28]\n"
+ "str q28, [x24, x28]\n"
+ "str q29, [x25, x28]\n"
+ "str q30, [x26, x28]\n"
+ "str q31, [x27, x28]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "ldp x24, x28, [x19], #0x10\n"
- "lsl x27, x10, #0x2\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "ldr q5, [x24, #0x0]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "ldr q2, [x24, #0x10]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "ldr q0, [x28, #0x0]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "ldr q30, [x28, #0x10]\n"
- "fmla v9.4s, v20.4s, v19.s[0]\n"
- "ldr q8, [%x[weights], #0x0]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x23, #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x23, #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldr q5, [x9, #0x10]\n"
+ "fmla v16.4s, v4.4s, v3.s[0]\n"
+ "fmla v17.4s, v4.4s, v3.s[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v7.4s, v20.4s, v19.s[1]\n"
- "fmla v6.4s, v20.4s, v19.s[2]\n"
- "fmla v4.4s, v20.4s, v19.s[3]\n"
- "fmla v3.4s, v20.4s, v18.s[0]\n"
- "fmla v1.4s, v20.4s, v18.s[1]\n"
- "fmla v31.4s, v20.4s, v18.s[2]\n"
- "fmla v29.4s, v20.4s, v18.s[3]\n"
- "fmla v28.4s, v20.4s, v17.s[0]\n"
- "fmla v27.4s, v20.4s, v17.s[1]\n"
- "fmla v26.4s, v20.4s, v17.s[2]\n"
- "fmla v25.4s, v20.4s, v17.s[3]\n"
- "fmla v24.4s, v20.4s, v16.s[0]\n"
- "fmla v23.4s, v20.4s, v16.s[1]\n"
- "fmla v22.4s, v20.4s, v16.s[2]\n"
- "fmla v21.4s, v20.4s, v16.s[3]\n"
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "fmin v9.4s, v9.4s, v10.4s\n"
- "fmin v7.4s, v7.4s, v10.4s\n"
- "fmin v6.4s, v6.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v11.4s\n"
- "str q9, [x19, x27]\n"
- "fmax v7.4s, v7.4s, v11.4s\n"
- "fmax v6.4s, v6.4s, v11.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmin v4.4s, v4.4s, v10.4s\n"
- "str q7, [x20, x27]\n"
- "fmin v3.4s, v3.4s, v10.4s\n"
- "fmin v1.4s, v1.4s, v10.4s\n"
- "str q6, [x21, x27]\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "str q4, [x22, x27]\n"
- "fmax v1.4s, v1.4s, v11.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q3, [x23, x27]\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q1, [x24, x27]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "str q31, [x25, x27]\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q29, [x26, x27]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "str q28, [x19, x27]\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q27, [x20, x27]\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "str q26, [x21, x27]\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "str q25, [x22, x27]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "str q24, [x23, x27]\n"
+ "fmla v18.4s, v4.4s, v3.s[2]\n"
+ "fmla v19.4s, v4.4s, v3.s[3]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v21.4s, v4.4s, v2.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[2]\n"
+ "fmla v23.4s, v4.4s, v2.s[3]\n"
+ "fmla v24.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v27.4s, v4.4s, v1.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[0]\n"
+ "fmla v29.4s, v4.4s, v0.s[1]\n"
+ "fmla v30.4s, v4.4s, v0.s[2]\n"
+ "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmin v16.4s, v16.4s, v10.4s\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmin v17.4s, v17.4s, v10.4s\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmin v18.4s, v18.4s, v10.4s\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmin v19.4s, v19.4s, v10.4s\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmin v20.4s, v20.4s, v10.4s\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
"fmin v21.4s, v21.4s, v10.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q23, [x24, x27]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v11.4s\n"
+ "str q16, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v11.4s\n"
+ "fmax v19.4s, v19.4s, v11.4s\n"
+ "str q17, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v11.4s\n"
"fmax v21.4s, v21.4s, v11.4s\n"
- "str q22, [x25, x27]\n"
- "str q21, [x26, x27]\n"
- "b 7f\n"
- "6:" // Output channel loop: Single kernel point
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "lsl x27, x10, #0x2\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "fmin v9.4s, v9.4s, v10.4s\n"
- "fmin v7.4s, v7.4s, v10.4s\n"
- "fmin v6.4s, v6.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v11.4s\n"
- "str q9, [x19, x27]\n"
- "fmax v7.4s, v7.4s, v11.4s\n"
- "fmax v6.4s, v6.4s, v11.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmin v4.4s, v4.4s, v10.4s\n"
- "str q7, [x20, x27]\n"
- "fmin v3.4s, v3.4s, v10.4s\n"
- "fmin v1.4s, v1.4s, v10.4s\n"
- "str q6, [x21, x27]\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "str q4, [x22, x27]\n"
- "fmax v1.4s, v1.4s, v11.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "str q3, [x23, x27]\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "str q1, [x24, x27]\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "str q31, [x25, x27]\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "str q29, [x26, x27]\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "str q28, [x19, x27]\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "str q27, [x20, x27]\n"
+ "str q18, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "str q19, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"fmin v24.4s, v24.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "str q26, [x21, x27]\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "str q20, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "str q21, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "str q22, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v10.4s\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "str q23, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
"fmax v24.4s, v24.4s, v11.4s\n"
- "str q25, [x22, x27]\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
- "str q24, [x23, x27]\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "str q24, [x20, x28]\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "str q25, [x21, x28]\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "str q26, [x22, x28]\n"
+ "fmax v30.4s, v30.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "str q27, [x23, x28]\n"
+ "str q28, [x24, x28]\n"
+ "str q29, [x25, x28]\n"
+ "str q30, [x26, x28]\n"
+ "str q31, [x27, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmin v16.4s, v16.4s, v10.4s\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmin v17.4s, v17.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmin v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmin v19.4s, v19.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmin v20.4s, v20.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
"fmin v21.4s, v21.4s, v10.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "str q23, [x24, x27]\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmax v16.4s, v16.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v11.4s\n"
+ "str q16, [x20, x28]\n"
+ "fmax v18.4s, v18.4s, v11.4s\n"
+ "fmax v19.4s, v19.4s, v11.4s\n"
+ "str q17, [x21, x28]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v20.4s, v20.4s, v11.4s\n"
"fmax v21.4s, v21.4s, v11.4s\n"
- "str q22, [x25, x27]\n"
- "str q21, [x26, x27]\n"
+ "str q18, [x22, x28]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "str q19, [x23, x28]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "str q20, [x24, x28]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "str q21, [x25, x28]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "str q22, [x26, x28]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin v30.4s, v30.4s, v10.4s\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "str q23, [x27, x28]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "str q24, [x20, x28]\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "str q25, [x21, x28]\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "str q26, [x22, x28]\n"
+ "fmax v30.4s, v30.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "str q27, [x23, x28]\n"
+ "str q28, [x24, x28]\n"
+ "str q29, [x25, x28]\n"
+ "str q30, [x26, x28]\n"
+ "str q31, [x27, x28]\n"
"7:" // Output channel loop: Done
"add x10, x10, #0x4\n"
- "cmp x10, x9, LSL #2\n"
+ "cmp x10, x11, LSL #2\n"
"blt 1b\n"
"tst %x[n_output_channels], #0x3\n"
"beq 19f\n"
"8:" // Output channel oddments
- "movi v16.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
"cbz %x[bias], 11f\n"
- "add x19, %x[bias], x10, LSL #2\n"
+ "add x20, %x[bias], x10, LSL #2\n"
"tbz %x[n_output_channels], #1, 9f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 10f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 10f\n"
"9:" // Output channel oddments: Load bias: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 10f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "ld1 { v31.s }[0], [x20]\n"
"10:" // Output channel oddments: Load bias: Bit 1: End
-
"11:" // Output channel oddments: Load bias: Done
- "mov v9.16b, v16.16b\n"
- "ldr q8, [%x[weights], #0x0]\n"
- "mov x19, %x[inptrs]\n"
- "mov v7.16b, v16.16b\n"
- "ldp x24, x28, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "mov v6.16b, v16.16b\n"
- "ldr q5, [x24, #0x0]\n"
- "mov v4.16b, v16.16b\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr q8, [x23, #0x0]\n"
+ "ldr q7, [x23, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q5, [x9, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
"add %x[weights], %x[weights], #0x10\n"
- "mov v3.16b, v16.16b\n"
- "ldr q2, [x24, #0x10]\n"
- "mov v1.16b, v16.16b\n"
- "ldr q0, [x28, #0x0]\n"
- "mov v31.16b, v16.16b\n"
- "ldr q30, [x28, #0x10]\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "cbz x20, 15f\n"
- "ldp x24, x28, [x19], #0x10\n"
- "ldr q20, [%x[weights], #0x0]\n"
- "subs x20, x20, #0x1\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x21, 15f\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
"add %x[weights], %x[weights], #0x10\n"
- "ldr q19, [x24, #0x0]\n"
- "ldr q18, [x24, #0x10]\n"
- "ldr q17, [x28, #0x0]\n"
- "ldr q16, [x28, #0x10]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q2, [x23, #0x10]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
"beq 13f\n"
"12:" // Output channel oddments: Kernel loop
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "ldp x24, x28, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "ldr q5, [x24, #0x0]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "ldr q2, [x24, #0x10]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "ldr q0, [x28, #0x0]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "ldr q30, [x28, #0x10]\n"
- "fmla v9.4s, v20.4s, v19.s[0]\n"
- "ldr q8, [%x[weights], #0x0]\n"
- "fmla v7.4s, v20.4s, v19.s[1]\n"
- "ldp x24, x28, [x19], #0x10\n"
- "fmla v6.4s, v20.4s, v19.s[2]\n"
- "fmla v4.4s, v20.4s, v19.s[3]\n"
- "ldr q19, [x24, #0x0]\n"
- "fmla v3.4s, v20.4s, v18.s[0]\n"
- "fmla v1.4s, v20.4s, v18.s[1]\n"
- "fmla v31.4s, v20.4s, v18.s[2]\n"
- "fmla v29.4s, v20.4s, v18.s[3]\n"
- "ldr q18, [x24, #0x10]\n"
- "fmla v28.4s, v20.4s, v17.s[0]\n"
- "fmla v27.4s, v20.4s, v17.s[1]\n"
- "fmla v26.4s, v20.4s, v17.s[2]\n"
- "fmla v25.4s, v20.4s, v17.s[3]\n"
- "ldr q17, [x28, #0x0]\n"
- "fmla v24.4s, v20.4s, v16.s[0]\n"
- "fmla v23.4s, v20.4s, v16.s[1]\n"
- "fmla v22.4s, v20.4s, v16.s[2]\n"
- "fmla v21.4s, v20.4s, v16.s[3]\n"
- "ldr q16, [x28, #0x10]\n"
- "ldr q20, [%x[weights], #0x10]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x23, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x23, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x9, #0x10]\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "fmla v16.4s, v4.4s, v3.s[0]\n"
+ "fmla v17.4s, v4.4s, v3.s[1]\n"
+ "fmla v18.4s, v4.4s, v3.s[2]\n"
+ "fmla v19.4s, v4.4s, v3.s[3]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v21.4s, v4.4s, v2.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[2]\n"
+ "fmla v23.4s, v4.4s, v2.s[3]\n"
+ "ldr q2, [x23, #0x10]\n"
+ "fmla v24.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v27.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "fmla v28.4s, v4.4s, v0.s[0]\n"
+ "fmla v29.4s, v4.4s, v0.s[1]\n"
+ "fmla v30.4s, v4.4s, v0.s[2]\n"
+ "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [x9, #0x10]\n"
+ "ldr q4, [%x[weights], #0x10]\n"
"add %x[weights], %x[weights], #0x20\n"
"bgt 12b\n"
"13:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 14f\n"
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "fmla v9.4s, v20.4s, v19.s[0]\n"
- "fmla v7.4s, v20.4s, v19.s[1]\n"
- "fmla v6.4s, v20.4s, v19.s[2]\n"
- "fmla v4.4s, v20.4s, v19.s[3]\n"
- "fmla v3.4s, v20.4s, v18.s[0]\n"
- "fmla v1.4s, v20.4s, v18.s[1]\n"
- "fmla v31.4s, v20.4s, v18.s[2]\n"
- "fmla v29.4s, v20.4s, v18.s[3]\n"
- "fmla v28.4s, v20.4s, v17.s[0]\n"
- "fmla v27.4s, v20.4s, v17.s[1]\n"
- "fmla v26.4s, v20.4s, v17.s[2]\n"
- "fmla v25.4s, v20.4s, v17.s[3]\n"
- "fmla v24.4s, v20.4s, v16.s[0]\n"
- "fmla v23.4s, v20.4s, v16.s[1]\n"
- "fmla v22.4s, v20.4s, v16.s[2]\n"
- "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v4.4s, v3.s[0]\n"
+ "fmla v17.4s, v4.4s, v3.s[1]\n"
+ "fmla v18.4s, v4.4s, v3.s[2]\n"
+ "fmla v19.4s, v4.4s, v3.s[3]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v21.4s, v4.4s, v2.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[2]\n"
+ "fmla v23.4s, v4.4s, v2.s[3]\n"
+ "fmla v24.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v27.4s, v4.4s, v1.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[0]\n"
+ "fmla v29.4s, v4.4s, v0.s[1]\n"
+ "fmla v30.4s, v4.4s, v0.s[2]\n"
+ "fmla v31.4s, v4.4s, v0.s[3]\n"
"b 16f\n"
"14:" // Output channel oddments: Odd tail
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "ldp x24, x28, [x19], #0x10\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "ldr q5, [x24, #0x0]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "ldr q2, [x24, #0x10]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "ldr q0, [x28, #0x0]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
- "ldr q30, [x28, #0x10]\n"
- "fmla v9.4s, v20.4s, v19.s[0]\n"
- "ldr q8, [%x[weights], #0x0]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "ldp x23, x9, [x20], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x23, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x23, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x9, #0x10]\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "fmla v16.4s, v4.4s, v3.s[0]\n"
+ "fmla v17.4s, v4.4s, v3.s[1]\n"
"add %x[weights], %x[weights], #0x10\n"
- "fmla v7.4s, v20.4s, v19.s[1]\n"
- "fmla v6.4s, v20.4s, v19.s[2]\n"
- "fmla v4.4s, v20.4s, v19.s[3]\n"
- "fmla v3.4s, v20.4s, v18.s[0]\n"
- "fmla v1.4s, v20.4s, v18.s[1]\n"
- "fmla v31.4s, v20.4s, v18.s[2]\n"
- "fmla v29.4s, v20.4s, v18.s[3]\n"
- "fmla v28.4s, v20.4s, v17.s[0]\n"
- "fmla v27.4s, v20.4s, v17.s[1]\n"
- "fmla v26.4s, v20.4s, v17.s[2]\n"
- "fmla v25.4s, v20.4s, v17.s[3]\n"
- "fmla v24.4s, v20.4s, v16.s[0]\n"
- "fmla v23.4s, v20.4s, v16.s[1]\n"
- "fmla v22.4s, v20.4s, v16.s[2]\n"
- "fmla v21.4s, v20.4s, v16.s[3]\n"
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "fmla v18.4s, v4.4s, v3.s[2]\n"
+ "fmla v19.4s, v4.4s, v3.s[3]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v21.4s, v4.4s, v2.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[2]\n"
+ "fmla v23.4s, v4.4s, v2.s[3]\n"
+ "fmla v24.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v27.4s, v4.4s, v1.s[3]\n"
+ "fmla v28.4s, v4.4s, v0.s[0]\n"
+ "fmla v29.4s, v4.4s, v0.s[1]\n"
+ "fmla v30.4s, v4.4s, v0.s[2]\n"
+ "fmla v31.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
"b 16f\n"
"15:" // Output channel oddments: Single kernel point
- "fmla v9.4s, v8.4s, v5.s[0]\n"
- "fmla v7.4s, v8.4s, v5.s[1]\n"
- "fmla v6.4s, v8.4s, v5.s[2]\n"
- "fmla v4.4s, v8.4s, v5.s[3]\n"
- "fmla v3.4s, v8.4s, v2.s[0]\n"
- "fmla v1.4s, v8.4s, v2.s[1]\n"
- "fmla v31.4s, v8.4s, v2.s[2]\n"
- "fmla v29.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v0.s[0]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "fmla v26.4s, v8.4s, v0.s[2]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "fmla v24.4s, v8.4s, v30.s[0]\n"
- "fmla v23.4s, v8.4s, v30.s[1]\n"
- "fmla v22.4s, v8.4s, v30.s[2]\n"
- "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
"16:" // Output channel oddments: Done
- "fmin v9.4s, v9.4s, v10.4s\n"
- "fmin v7.4s, v7.4s, v10.4s\n"
- "fmin v6.4s, v6.4s, v10.4s\n"
- "fmin v4.4s, v4.4s, v10.4s\n"
- "fmax v9.4s, v9.4s, v11.4s\n"
- "fmax v7.4s, v7.4s, v11.4s\n"
- "fmax v6.4s, v6.4s, v11.4s\n"
- "fmax v4.4s, v4.4s, v11.4s\n"
- "fmin v3.4s, v3.4s, v10.4s\n"
- "fmin v1.4s, v1.4s, v10.4s\n"
- "fmin v31.4s, v31.4s, v10.4s\n"
- "fmax v3.4s, v3.4s, v11.4s\n"
- "fmax v1.4s, v1.4s, v11.4s\n"
- "fmax v31.4s, v31.4s, v11.4s\n"
- "fmin v29.4s, v29.4s, v10.4s\n"
- "fmin v28.4s, v28.4s, v10.4s\n"
- "fmin v27.4s, v27.4s, v10.4s\n"
- "fmax v29.4s, v29.4s, v11.4s\n"
- "fmax v28.4s, v28.4s, v11.4s\n"
- "fmax v27.4s, v27.4s, v11.4s\n"
- "fmin v26.4s, v26.4s, v10.4s\n"
- "fmin v25.4s, v25.4s, v10.4s\n"
- "fmin v24.4s, v24.4s, v10.4s\n"
- "fmax v26.4s, v26.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v11.4s\n"
- "fmax v24.4s, v24.4s, v11.4s\n"
- "fmin v23.4s, v23.4s, v10.4s\n"
- "fmin v22.4s, v22.4s, v10.4s\n"
+ "fmin v16.4s, v16.4s, v10.4s\n"
+ "fmin v17.4s, v17.4s, v10.4s\n"
+ "fmin v18.4s, v18.4s, v10.4s\n"
+ "fmin v19.4s, v19.4s, v10.4s\n"
+ "fmin v20.4s, v20.4s, v10.4s\n"
"fmin v21.4s, v21.4s, v10.4s\n"
- "fmax v23.4s, v23.4s, v11.4s\n"
- "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "fmin v30.4s, v30.4s, v10.4s\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v11.4s\n"
+ "fmax v17.4s, v17.4s, v11.4s\n"
+ "fmax v18.4s, v18.4s, v11.4s\n"
+ "fmax v19.4s, v19.4s, v11.4s\n"
+ "fmax v20.4s, v20.4s, v11.4s\n"
"fmax v21.4s, v21.4s, v11.4s\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "fmax v30.4s, v30.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
"tbz %x[n_output_channels], #1, 17f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #2\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #2\n"
- "st1 { v9.d }[0], [x19]\n"
"add x21, x21, x10, LSL #2\n"
- "st1 { v7.d }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #2\n"
- "st1 { v6.d }[0], [x21]\n"
"add x23, x23, x10, LSL #2\n"
- "st1 { v4.d }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #2\n"
- "st1 { v3.d }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #2\n"
- "st1 { v1.d }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v31.d }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #2\n"
- "st1 { v29.d }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v16.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #2\n"
- "st1 { v28.d }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #2\n"
- "st1 { v27.d }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #2\n"
- "st1 { v26.d }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #2\n"
- "st1 { v25.d }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #2\n"
- "st1 { v24.d }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #2\n"
- "st1 { v23.d }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v22.d }[0], [x25]\n"
+ "st1 { v23.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #2\n"
"add x10, x10, #0x2\n"
- "st1 { v21.d }[0], [x26]\n"
+ "st1 { v24.d }[0], [x20]\n"
+ "st1 { v25.d }[0], [x21]\n"
+ "st1 { v26.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x23]\n"
+ "st1 { v28.d }[0], [x24]\n"
+ "st1 { v29.d }[0], [x25]\n"
+ "st1 { v30.d }[0], [x26]\n"
+ "st1 { v31.d }[0], [x27]\n"
"tbz %x[n_output_channels], #0, 18f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #2\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #2\n"
- "st1 { v9.s }[2], [x19]\n"
"add x21, x21, x10, LSL #2\n"
- "st1 { v7.s }[2], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #2\n"
- "st1 { v6.s }[2], [x21]\n"
"add x23, x23, x10, LSL #2\n"
- "st1 { v4.s }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #2\n"
- "st1 { v3.s }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #2\n"
- "st1 { v1.s }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v31.s }[2], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #2\n"
- "st1 { v29.s }[2], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v16.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #2\n"
- "st1 { v28.s }[2], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #2\n"
- "st1 { v27.s }[2], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #2\n"
- "st1 { v26.s }[2], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #2\n"
- "st1 { v25.s }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #2\n"
- "st1 { v24.s }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #2\n"
- "st1 { v23.s }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v21.s }[2], [x26]\n"
+ "st1 { v23.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "st1 { v30.s }[2], [x26]\n"
+ "st1 { v31.s }[2], [x27]\n"
"b 18f\n"
"17:" // Output channel oddments: Done: Store: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 18f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x10, LSL #2\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x10, LSL #2\n"
- "st1 { v9.s }[0], [x19]\n"
"add x21, x21, x10, LSL #2\n"
- "st1 { v7.s }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x10, LSL #2\n"
- "st1 { v6.s }[0], [x21]\n"
"add x23, x23, x10, LSL #2\n"
- "st1 { v4.s }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x10, LSL #2\n"
- "st1 { v3.s }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x10, LSL #2\n"
- "st1 { v1.s }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v31.s }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x10, LSL #2\n"
- "st1 { v29.s }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v16.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x10, LSL #2\n"
- "st1 { v28.s }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x10, LSL #2\n"
- "st1 { v27.s }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x10, LSL #2\n"
- "st1 { v26.s }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x10, LSL #2\n"
- "st1 { v25.s }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x10, LSL #2\n"
- "st1 { v24.s }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x10, LSL #2\n"
- "st1 { v23.s }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x10, LSL #2\n"
- "st1 { v22.s }[0], [x25]\n"
- "st1 { v21.s }[0], [x26]\n"
+ "st1 { v23.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x20]\n"
+ "st1 { v25.s }[0], [x21]\n"
+ "st1 { v26.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x23]\n"
+ "st1 { v28.s }[0], [x24]\n"
+ "st1 { v29.s }[0], [x25]\n"
+ "st1 { v30.s }[0], [x26]\n"
+ "st1 { v31.s }[0], [x27]\n"
"18:" // Output channel oddments: Done: Store: Bit 1: End
"19:" // Done
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index f8245fc5d9..fda88f94bb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,1282 +41,1622 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ldp x13, x12, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "ldp x11, x10, [%x[inptrs], #0x10]\n"
- "mov x19, #0x1\n"
- "ldp x9, x28, [%x[inptrs], #0x20]\n"
- "orr x19, x19, #0x100\n"
- "ldp x27, x26, [%x[inptrs], #0x30]\n"
- "orr x19, x19, #0x10000\n"
- "dup v11.4s, w19\n"
- "ldp x25, x24, [%x[outptrs], #0x0]\n"
- "mov x23, #0x0\n"
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v14.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "ldp x24, x23, [%x[outptrs], #0x0]\n"
"ldp x22, x21, [%x[outptrs], #0x10]\n"
- "lsr x20, %x[n_channels], #0x4\n"
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v9.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v14.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.4s }, [x19]\n"
- "cbz x20, 2f\n"
+ "cbz x11, 3f\n"
+ "ldr q9, [x15, x28]\n"
+ "ldr q8, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q7, [x13, x28]\n"
+ "ldr q6, [x12, x28]\n"
+ "zip2 v5.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "ldr q4, [x10, x28]\n"
+ "ldr q3, [x9, x28]\n"
+ "zip1 v7.16b, v8.16b, v6.16b\n"
+ "zip2 v6.16b, v8.16b, v6.16b\n"
+ "ldr q2, [x26, x28]\n"
+ "ldr q1, [x25, x28]\n"
+ "zip2 v8.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v5.16b, v6.16b\n"
+ "zip2 v6.16b, v5.16b, v6.16b\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldr q29, [x15, x28]\n"
+ "zip1 v2.16b, v3.16b, v1.16b\n"
+ "zip2 v1.16b, v3.16b, v1.16b\n"
+ "ldr q28, [x14, x28]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "zip2 v3.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "ldr q27, [x13, x28]\n"
+ "ldr q26, [x12, x28]\n"
+ "zip2 v25.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldr q24, [x10, x28]\n"
+ "zip1 v27.16b, v28.16b, v26.16b\n"
+ "zip2 v26.16b, v28.16b, v26.16b\n"
+ "ldr q23, [x9, x28]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "zip1 v2.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "ldr q22, [x26, x28]\n"
+ "ldr q21, [x25, x28]\n"
+ "zip2 v20.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v23.16b, v21.16b\n"
+ "zip2 v21.16b, v23.16b, v21.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip2 v28.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v25.16b, v26.16b\n"
+ "zip2 v26.16b, v25.16b, v26.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v23.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v20.16b, v21.16b\n"
+ "zip2 v21.16b, v20.16b, v21.16b\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ "beq 2f\n"
"1:" // Loop
- "movi v15.4s, #0x0\n"
- "ldr q27, [x13, x23]\n"
- "subs x20, x20, #0x1\n"
- "movi v10.4s, #0x0\n"
- "ldr q1, [x12, x23]\n"
- "ldp x13, x12, [%x[inptrs], #0x40]\n"
- "ldr q25, [x11, x23]\n"
- "zip1 v7.16b, v27.16b, v25.16b\n"
- "ldr q23, [x10, x23]\n"
- "zip2 v5.16b, v27.16b, v25.16b\n"
- "ldp x11, x10, [%x[inptrs], #0x50]\n"
- "ldr q31, [x9, x23]\n"
- "zip1 v8.16b, v1.16b, v23.16b\n"
- "ldr q28, [x28, x23]\n"
- "zip2 v3.16b, v1.16b, v23.16b\n"
- "ldp x9, x28, [%x[inptrs], #0x60]\n"
- "zip1 v6.16b, v7.16b, v8.16b\n"
- "ldr q21, [x27, x23]\n"
- "zip2 v8.16b, v7.16b, v8.16b\n"
- "ldr q26, [x26, x23]\n"
- "zip1 v7.16b, v5.16b, v3.16b\n"
- "ldp x27, x26, [%x[inptrs], #0x70]\n"
- "zip2 v5.16b, v5.16b, v3.16b\n"
- "ldr q24, [x13, x23]\n"
- "ldr q22, [x12, x23]\n"
- "zip1 v2.16b, v31.16b, v21.16b\n"
- "zip2 v4.16b, v31.16b, v21.16b\n"
- "ldp x13, x12, [%x[inptrs], #0x0]\n"
- "zip1 v1.16b, v28.16b, v26.16b\n"
- "ldr q20, [x11, x23]\n"
- "zip2 v31.16b, v28.16b, v26.16b\n"
- "ldr q16, [x10, x23]\n"
- "zip1 v3.16b, v2.16b, v1.16b\n"
- "ldp x11, x10, [%x[inptrs], #0x10]\n"
- "zip2 v2.16b, v2.16b, v1.16b\n"
- "ldr q19, [x9, x23]\n"
- "zip1 v1.16b, v4.16b, v31.16b\n"
- "ldr q0, [x28, x23]\n"
- "zip1 v28.16b, v24.16b, v20.16b\n"
- "ldp x9, x28, [%x[inptrs], #0x20]\n"
- "zip2 v26.16b, v24.16b, v20.16b\n"
- "ldr q18, [x27, x23]\n"
- "zip1 v24.16b, v22.16b, v16.16b\n"
- "ldr q17, [x26, x23]\n"
- "zip2 v22.16b, v22.16b, v16.16b\n"
- "ldp x27, x26, [%x[inptrs], #0x30]\n"
- "zip2 v16.16b, v4.16b, v31.16b\n"
- "str q7, [SP, #0x0]\n"
- "zip1 v31.16b, v28.16b, v24.16b\n"
- "str q5, [SP, #0x10]\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "str q1, [SP, #0x20]\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "str q16, [SP, #0x30]\n"
- "zip1 v18.16b, v0.16b, v17.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "zip2 v17.16b, v0.16b, v17.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "zip1 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x40]\n"
- "zip2 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x50]\n"
- "zip1 v26.16b, v20.16b, v18.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "zip2 v24.16b, v20.16b, v18.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "zip1 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x60]\n"
- "zip2 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x70]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
- "mov v20.16b, v30.16b\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- "ldr q29, [%x[params], #0x70]\n"
- ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- "ldr q3, [SP, #0x20]\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- "ldr q27, [%x[params], #0x80]\n"
- ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- "ldr q31, [SP, #0x40]\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "ldr q25, [%x[params], #0x90]\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
- "ldr q6, [SP, #0x0]\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "ldr q26, [SP, #0x60]\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "movi v15.4s, #0x0\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
+ ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
+ ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
+ ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
+ ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
+ ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
+ ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
+ "ldr q9, [%x[params], #0x0]\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
+ ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x60]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0xa0]\n"
- ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x40]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x50]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x30]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "mov v17.16b, v15.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "ldr q30, [%x[params], #0x60]\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0xb0]\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
+ ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x20]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x22, x23]\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "mov v19.16b, v30.16b\n"
- "add x23, x23, #0x4\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
"ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "movi v10.4s, #0x0\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "mls v30.4s, v15.4s, v14.4s\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
"ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- "ldr q29, [%x[params], #0xd0]\n"
- ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- "ldr q2, [SP, #0x30]\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- "ldr q27, [%x[params], #0xe0]\n"
- ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- "ldr q28, [SP, #0x50]\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "ldr q25, [%x[params], #0xf0]\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
- "ldr q8, [SP, #0x10]\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "ldr q24, [SP, #0x70]\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
- "movi v15.4s, #0x0\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x9, x28]\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
+ ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
+ ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
+ ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
+ "ldr q8, [x14, x28]\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0xc0]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
- "movi v10.4s, #0x0\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x100]\n"
- ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x90]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0xd0]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
+ ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x80]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
+ ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
+ ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
+ ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
+ "ldr q2, [x26, x28]\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
+ ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
+ ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
+ ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
+ "ldr q7, [x13, x28]\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x120]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "mov v17.16b, v15.16b\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0xf0]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x130]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
+ ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0x110]\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "ldr q30, [%x[params], #0xc0]\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "str s20, [x22, x23]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "mov v19.16b, v30.16b\n"
- "add x23, x23, #0x4\n"
- ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
"ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
+ ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
+ ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
"ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- "ldr q29, [%x[params], #0x130]\n"
- ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- "ldr q27, [%x[params], #0x140]\n"
- ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "ldr q25, [%x[params], #0x150]\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v16.16b, v20.16b, v21.16b\n"
- "movi v15.4s, #0x0\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "movi v10.4s, #0x0\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x160]\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
+ ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
+ "ldr q1, [x25, x28]\n"
+ ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
+ ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
+ ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
+ ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
+ "ldr q6, [x12, x28]\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [x15, x28]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldr q29, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "mov v17.16b, v15.16b\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "ldr q27, [x13, x28]\n"
+ "ldr q26, [x12, x28]\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x170]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x150]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [x10, x28]\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldr q24, [x10, x28]\n"
+ "ldr q23, [x9, x28]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldr q22, [x26, x28]\n"
+ "ldr q21, [x25, x28]\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
"smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0x170]\n"
+ "str s5, [x24, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "zip2 v5.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v8.16b, v6.16b\n"
+ "zip2 v6.16b, v8.16b, v6.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "ldr q30, [%x[params], #0x120]\n"
+ "str s25, [x22, x27]\n"
+ "zip2 v8.16b, v9.16b, v7.16b\n"
+ "str s20, [x21, x27]\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v5.16b, v6.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v6.16b, v5.16b, v6.16b\n"
+ "ldr q5, [%x[params], #0x140]\n"
+ "zip2 v30.16b, v4.16b, v2.16b\n"
"add %x[params], %x[params], #0x180\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "zip1 v2.16b, v3.16b, v1.16b\n"
+ "zip2 v1.16b, v3.16b, v1.16b\n"
+ "zip2 v25.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v28.16b, v26.16b\n"
+ "zip2 v26.16b, v28.16b, v26.16b\n"
+ "zip2 v20.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v23.16b, v21.16b\n"
+ "zip2 v21.16b, v23.16b, v21.16b\n"
+ "zip2 v3.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "zip1 v2.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v25.16b, v26.16b\n"
+ "zip2 v26.16b, v25.16b, v26.16b\n"
+ "zip2 v23.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v20.16b, v21.16b\n"
+ "zip2 v21.16b, v20.16b, v21.16b\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
+ ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
+ ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
+ ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
+ ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
+ ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
+ ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
+ "ldr q9, [%x[params], #0x0]\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
+ ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x60]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x40]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x50]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x30]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
+ ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x20]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x22, x23]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "mov v19.16b, v30.16b\n"
- "add x23, x23, #0x4\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
"ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "mls v30.4s, v15.4s, v14.4s\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
"ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v16.16b, v20.16b, v21.16b\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
+ ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
+ ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
+ ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0xc0]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x90]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0xd0]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
+ ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x80]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
+ ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
+ ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
+ ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
+ ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
+ ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
+ ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x120]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0xf0]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x130]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
+ ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v9.4s\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
+ ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
+ ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
+ ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
+ ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
+ ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
+ ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str s5, [x24, x27]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x22, x23]\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "add x23, x23, #0x4\n"
- "bgt 1b\n"
- "tst %x[n_channels], #0xf\n"
- "beq 34f\n"
- "2:" // Oddments
- "and x19, %x[n_channels], #0xf\n"
- "add x13, x13, x23\n"
- "add x12, x12, x23\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "add x9, x9, x23\n"
- "add x28, x28, x23\n"
- "add x27, x27, x23\n"
- "add x26, x26, x23\n"
- "tbz %x[n_channels], #3, 6f\n"
- "ld1 { v27.d }[0], [x13], #0x8\n"
- "ld1 { v1.d }[0], [x12], #0x8\n"
- "ld1 { v25.d }[0], [x11], #0x8\n"
- "ld1 { v23.d }[0], [x10], #0x8\n"
- "ld1 { v31.d }[0], [x9], #0x8\n"
- "ld1 { v28.d }[0], [x28], #0x8\n"
- "ld1 { v21.d }[0], [x27], #0x8\n"
- "ld1 { v26.d }[0], [x26], #0x8\n"
- "tbz %x[n_channels], #2, 4f\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d8, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d6, [x12], #0x8\n"
+ "ldr d4, [x10], #0x8\n"
+ "ldr d3, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v6.s }[2], [x12], #0x4\n"
+ "ld1 { v4.s }[2], [x10], #0x4\n"
+ "ld1 { v3.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.h }[6], [x15], #0x2\n"
+ "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v6.h }[6], [x12], #0x2\n"
+ "ld1 { v4.h }[6], [x10], #0x2\n"
+ "ld1 { v3.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[14], [x15], #0x1\n"
+ "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v6.b }[14], [x12], #0x1\n"
+ "ld1 { v4.b }[14], [x10], #0x1\n"
+ "ld1 { v3.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x25], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[12], [x15], #0x1\n"
+ "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v6.b }[12], [x12], #0x1\n"
+ "ld1 { v4.b }[12], [x10], #0x1\n"
+ "ld1 { v3.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x25], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.h }[4], [x15], #0x2\n"
+ "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v6.h }[4], [x12], #0x2\n"
+ "ld1 { v4.h }[4], [x10], #0x2\n"
+ "ld1 { v3.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[10], [x15], #0x1\n"
+ "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v6.b }[10], [x12], #0x1\n"
+ "ld1 { v4.b }[10], [x10], #0x1\n"
+ "ld1 { v3.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x25], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[8], [x15], #0x1\n"
+ "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v6.b }[8], [x12], #0x1\n"
+ "ld1 { v4.b }[8], [x10], #0x1\n"
+ "ld1 { v3.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x25], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s9, [x15], #0x4\n"
+ "ldr s8, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s6, [x12], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v9.h }[2], [x15], #0x2\n"
+ "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v6.h }[2], [x12], #0x2\n"
+ "ld1 { v4.h }[2], [x10], #0x2\n"
+ "ld1 { v3.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[6], [x15], #0x1\n"
+ "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v6.b }[6], [x12], #0x1\n"
+ "ld1 { v4.b }[6], [x10], #0x1\n"
+ "ld1 { v3.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x25], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[4], [x15], #0x1\n"
+ "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v6.b }[4], [x12], #0x1\n"
+ "ld1 { v4.b }[4], [x10], #0x1\n"
+ "ld1 { v3.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x25], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h9, [x15], #0x2\n"
+ "ldr h8, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h6, [x12], #0x2\n"
+ "ldr h4, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[2], [x15], #0x1\n"
+ "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v6.b }[2], [x12], #0x1\n"
+ "ld1 { v4.b }[2], [x10], #0x1\n"
+ "ld1 { v3.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x25], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b9, [x15], #0x1\n"
+ "ldr b8, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b6, [x12], #0x1\n"
+ "ldr b4, [x10], #0x1\n"
+ "ldr b3, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x25], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d29, [x15], #0x8\n"
+ "ldr d28, [x14], #0x8\n"
+ "ldr d27, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d22, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v29.s }[2], [x15], #0x4\n"
+ "ld1 { v28.s }[2], [x14], #0x4\n"
"ld1 { v27.s }[2], [x13], #0x4\n"
- "ld1 { v1.s }[2], [x12], #0x4\n"
- "ld1 { v25.s }[2], [x11], #0x4\n"
- "ld1 { v23.s }[2], [x10], #0x4\n"
- "ld1 { v31.s }[2], [x9], #0x4\n"
- "ld1 { v28.s }[2], [x28], #0x4\n"
- "ld1 { v21.s }[2], [x27], #0x4\n"
- "ld1 { v26.s }[2], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 3f\n"
+ "ld1 { v26.s }[2], [x12], #0x4\n"
+ "ld1 { v24.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v22.s }[2], [x26], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v29.h }[6], [x15], #0x2\n"
+ "ld1 { v28.h }[6], [x14], #0x2\n"
"ld1 { v27.h }[6], [x13], #0x2\n"
- "ld1 { v1.h }[6], [x12], #0x2\n"
- "ld1 { v25.h }[6], [x11], #0x2\n"
- "ld1 { v23.h }[6], [x10], #0x2\n"
- "ld1 { v31.h }[6], [x9], #0x2\n"
- "ld1 { v28.h }[6], [x28], #0x2\n"
- "ld1 { v21.h }[6], [x27], #0x2\n"
- "ld1 { v26.h }[6], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.h }[6], [x12], #0x2\n"
+ "ld1 { v24.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v22.h }[6], [x26], #0x2\n"
+ "ld1 { v21.h }[6], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[14], [x15], #0x1\n"
+ "ld1 { v28.b }[14], [x14], #0x1\n"
"ld1 { v27.b }[14], [x13], #0x1\n"
- "ld1 { v1.b }[14], [x12], #0x1\n"
- "ld1 { v25.b }[14], [x11], #0x1\n"
- "ld1 { v23.b }[14], [x10], #0x1\n"
- "ld1 { v31.b }[14], [x9], #0x1\n"
- "ld1 { v28.b }[14], [x28], #0x1\n"
- "ld1 { v21.b }[14], [x27], #0x1\n"
- "ld1 { v26.b }[14], [x26], #0x1\n"
- "b 10f\n"
- "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[14], [x12], #0x1\n"
+ "ld1 { v24.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v22.b }[14], [x26], #0x1\n"
+ "ld1 { v21.b }[14], [x25], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[12], [x15], #0x1\n"
+ "ld1 { v28.b }[12], [x14], #0x1\n"
"ld1 { v27.b }[12], [x13], #0x1\n"
- "ld1 { v1.b }[12], [x12], #0x1\n"
- "ld1 { v25.b }[12], [x11], #0x1\n"
- "ld1 { v23.b }[12], [x10], #0x1\n"
- "ld1 { v31.b }[12], [x9], #0x1\n"
- "ld1 { v28.b }[12], [x28], #0x1\n"
- "ld1 { v21.b }[12], [x27], #0x1\n"
- "ld1 { v26.b }[12], [x26], #0x1\n"
- "b 10f\n"
- "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset
- "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v26.b }[12], [x12], #0x1\n"
+ "ld1 { v24.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v22.b }[12], [x26], #0x1\n"
+ "ld1 { v21.b }[12], [x25], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v29.h }[4], [x15], #0x2\n"
+ "ld1 { v28.h }[4], [x14], #0x2\n"
"ld1 { v27.h }[4], [x13], #0x2\n"
- "ld1 { v1.h }[4], [x12], #0x2\n"
- "ld1 { v25.h }[4], [x11], #0x2\n"
- "ld1 { v23.h }[4], [x10], #0x2\n"
- "ld1 { v31.h }[4], [x9], #0x2\n"
- "ld1 { v28.h }[4], [x28], #0x2\n"
- "ld1 { v21.h }[4], [x27], #0x2\n"
- "ld1 { v26.h }[4], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.h }[4], [x12], #0x2\n"
+ "ld1 { v24.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v22.h }[4], [x26], #0x2\n"
+ "ld1 { v21.h }[4], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[10], [x15], #0x1\n"
+ "ld1 { v28.b }[10], [x14], #0x1\n"
"ld1 { v27.b }[10], [x13], #0x1\n"
- "ld1 { v1.b }[10], [x12], #0x1\n"
- "ld1 { v25.b }[10], [x11], #0x1\n"
- "ld1 { v23.b }[10], [x10], #0x1\n"
- "ld1 { v31.b }[10], [x9], #0x1\n"
- "ld1 { v28.b }[10], [x28], #0x1\n"
- "ld1 { v21.b }[10], [x27], #0x1\n"
- "ld1 { v26.b }[10], [x26], #0x1\n"
- "b 10f\n"
- "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[10], [x12], #0x1\n"
+ "ld1 { v24.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v22.b }[10], [x26], #0x1\n"
+ "ld1 { v21.b }[10], [x25], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[8], [x15], #0x1\n"
+ "ld1 { v28.b }[8], [x14], #0x1\n"
"ld1 { v27.b }[8], [x13], #0x1\n"
- "ld1 { v1.b }[8], [x12], #0x1\n"
- "ld1 { v25.b }[8], [x11], #0x1\n"
- "ld1 { v23.b }[8], [x10], #0x1\n"
- "ld1 { v31.b }[8], [x9], #0x1\n"
- "ld1 { v28.b }[8], [x28], #0x1\n"
- "ld1 { v21.b }[8], [x27], #0x1\n"
- "ld1 { v26.b }[8], [x26], #0x1\n"
- "b 10f\n"
- "6:" // Oddments: Load (A): Bit 3: Unset
- "tbz %x[n_channels], #2, 8f\n"
- "ld1 { v27.s }[0], [x13], #0x4\n"
- "ld1 { v1.s }[0], [x12], #0x4\n"
- "ld1 { v25.s }[0], [x11], #0x4\n"
- "ld1 { v23.s }[0], [x10], #0x4\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "ld1 { v21.s }[0], [x27], #0x4\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v26.b }[8], [x12], #0x1\n"
+ "ld1 { v24.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v22.b }[8], [x26], #0x1\n"
+ "ld1 { v21.b }[8], [x25], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s29, [x15], #0x4\n"
+ "ldr s28, [x14], #0x4\n"
+ "ldr s27, [x13], #0x4\n"
+ "ldr s26, [x12], #0x4\n"
+ "ldr s24, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v29.h }[2], [x15], #0x2\n"
+ "ld1 { v28.h }[2], [x14], #0x2\n"
"ld1 { v27.h }[2], [x13], #0x2\n"
- "ld1 { v1.h }[2], [x12], #0x2\n"
- "ld1 { v25.h }[2], [x11], #0x2\n"
- "ld1 { v23.h }[2], [x10], #0x2\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "ld1 { v21.h }[2], [x27], #0x2\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.h }[2], [x12], #0x2\n"
+ "ld1 { v24.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v22.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[6], [x15], #0x1\n"
+ "ld1 { v28.b }[6], [x14], #0x1\n"
"ld1 { v27.b }[6], [x13], #0x1\n"
- "ld1 { v1.b }[6], [x12], #0x1\n"
- "ld1 { v25.b }[6], [x11], #0x1\n"
- "ld1 { v23.b }[6], [x10], #0x1\n"
- "ld1 { v31.b }[6], [x9], #0x1\n"
- "ld1 { v28.b }[6], [x28], #0x1\n"
- "ld1 { v21.b }[6], [x27], #0x1\n"
- "ld1 { v26.b }[6], [x26], #0x1\n"
- "b 10f\n"
- "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[6], [x12], #0x1\n"
+ "ld1 { v24.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v22.b }[6], [x26], #0x1\n"
+ "ld1 { v21.b }[6], [x25], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[4], [x15], #0x1\n"
+ "ld1 { v28.b }[4], [x14], #0x1\n"
"ld1 { v27.b }[4], [x13], #0x1\n"
- "ld1 { v1.b }[4], [x12], #0x1\n"
- "ld1 { v25.b }[4], [x11], #0x1\n"
- "ld1 { v23.b }[4], [x10], #0x1\n"
- "ld1 { v31.b }[4], [x9], #0x1\n"
- "ld1 { v28.b }[4], [x28], #0x1\n"
- "ld1 { v21.b }[4], [x27], #0x1\n"
- "ld1 { v26.b }[4], [x26], #0x1\n"
- "b 10f\n"
- "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
- "tbz %x[n_channels], #1, 9f\n"
- "ld1 { v27.h }[0], [x13], #0x2\n"
- "ld1 { v1.h }[0], [x12], #0x2\n"
- "ld1 { v25.h }[0], [x11], #0x2\n"
- "ld1 { v23.h }[0], [x10], #0x2\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "ld1 { v21.h }[0], [x27], #0x2\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[4], [x12], #0x1\n"
+ "ld1 { v24.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v22.b }[4], [x26], #0x1\n"
+ "ld1 { v21.b }[4], [x25], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h29, [x15], #0x2\n"
+ "ldr h28, [x14], #0x2\n"
+ "ldr h27, [x13], #0x2\n"
+ "ldr h26, [x12], #0x2\n"
+ "ldr h24, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h22, [x26], #0x2\n"
+ "ldr h21, [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[2], [x15], #0x1\n"
+ "ld1 { v28.b }[2], [x14], #0x1\n"
"ld1 { v27.b }[2], [x13], #0x1\n"
- "ld1 { v1.b }[2], [x12], #0x1\n"
- "ld1 { v25.b }[2], [x11], #0x1\n"
- "ld1 { v23.b }[2], [x10], #0x1\n"
- "ld1 { v31.b }[2], [x9], #0x1\n"
- "ld1 { v28.b }[2], [x28], #0x1\n"
- "ld1 { v21.b }[2], [x27], #0x1\n"
- "ld1 { v26.b }[2], [x26], #0x1\n"
- "b 10f\n"
- "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[0], [x13], #0x1\n"
- "ld1 { v1.b }[0], [x12], #0x1\n"
- "ld1 { v25.b }[0], [x11], #0x1\n"
- "ld1 { v23.b }[0], [x10], #0x1\n"
- "ld1 { v31.b }[0], [x9], #0x1\n"
- "ld1 { v28.b }[0], [x28], #0x1\n"
- "ld1 { v21.b }[0], [x27], #0x1\n"
- "ld1 { v26.b }[0], [x26], #0x1\n"
- "10:" // Oddments: Load (A): Bit 3: End
- "ldp x13, x12, [%x[inptrs], #0x40]\n"
- "add x13, x13, x23\n"
- "ldp x11, x10, [%x[inptrs], #0x50]\n"
- "ldp x9, x28, [%x[inptrs], #0x60]\n"
- "add x12, x12, x23\n"
- "ldp x27, x26, [%x[inptrs], #0x70]\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "add x9, x9, x23\n"
- "add x28, x28, x23\n"
- "add x27, x27, x23\n"
- "add x26, x26, x23\n"
- "tbz %x[n_channels], #3, 14f\n"
- "ld1 { v24.d }[0], [x13], #0x8\n"
- "ld1 { v22.d }[0], [x12], #0x8\n"
- "ld1 { v20.d }[0], [x11], #0x8\n"
- "ld1 { v16.d }[0], [x10], #0x8\n"
- "ld1 { v19.d }[0], [x9], #0x8\n"
- "ld1 { v0.d }[0], [x28], #0x8\n"
- "ld1 { v18.d }[0], [x27], #0x8\n"
- "ld1 { v17.d }[0], [x26], #0x8\n"
- "tbz %x[n_channels], #2, 12f\n"
- "ld1 { v24.s }[2], [x13], #0x4\n"
- "ld1 { v22.s }[2], [x12], #0x4\n"
- "ld1 { v20.s }[2], [x11], #0x4\n"
- "ld1 { v16.s }[2], [x10], #0x4\n"
- "ld1 { v19.s }[2], [x9], #0x4\n"
- "ld1 { v0.s }[2], [x28], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "ld1 { v17.s }[2], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 11f\n"
- "ld1 { v24.h }[6], [x13], #0x2\n"
- "ld1 { v22.h }[6], [x12], #0x2\n"
- "ld1 { v20.h }[6], [x11], #0x2\n"
- "ld1 { v16.h }[6], [x10], #0x2\n"
- "ld1 { v19.h }[6], [x9], #0x2\n"
- "ld1 { v0.h }[6], [x28], #0x2\n"
- "ld1 { v18.h }[6], [x27], #0x2\n"
- "ld1 { v17.h }[6], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[14], [x13], #0x1\n"
- "ld1 { v22.b }[14], [x12], #0x1\n"
- "ld1 { v20.b }[14], [x11], #0x1\n"
- "ld1 { v16.b }[14], [x10], #0x1\n"
- "ld1 { v19.b }[14], [x9], #0x1\n"
- "ld1 { v0.b }[14], [x28], #0x1\n"
- "ld1 { v18.b }[14], [x27], #0x1\n"
- "ld1 { v17.b }[14], [x26], #0x1\n"
- "b 18f\n"
- "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[12], [x13], #0x1\n"
- "ld1 { v22.b }[12], [x12], #0x1\n"
- "ld1 { v20.b }[12], [x11], #0x1\n"
- "ld1 { v16.b }[12], [x10], #0x1\n"
- "ld1 { v19.b }[12], [x9], #0x1\n"
- "ld1 { v0.b }[12], [x28], #0x1\n"
- "ld1 { v18.b }[12], [x27], #0x1\n"
- "ld1 { v17.b }[12], [x26], #0x1\n"
- "b 18f\n"
- "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset
- "tbz %x[n_channels], #1, 13f\n"
- "ld1 { v24.h }[4], [x13], #0x2\n"
- "ld1 { v22.h }[4], [x12], #0x2\n"
- "ld1 { v20.h }[4], [x11], #0x2\n"
- "ld1 { v16.h }[4], [x10], #0x2\n"
- "ld1 { v19.h }[4], [x9], #0x2\n"
- "ld1 { v0.h }[4], [x28], #0x2\n"
- "ld1 { v18.h }[4], [x27], #0x2\n"
- "ld1 { v17.h }[4], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[10], [x13], #0x1\n"
- "ld1 { v22.b }[10], [x12], #0x1\n"
- "ld1 { v20.b }[10], [x11], #0x1\n"
- "ld1 { v16.b }[10], [x10], #0x1\n"
- "ld1 { v19.b }[10], [x9], #0x1\n"
- "ld1 { v0.b }[10], [x28], #0x1\n"
- "ld1 { v18.b }[10], [x27], #0x1\n"
- "ld1 { v17.b }[10], [x26], #0x1\n"
- "b 18f\n"
- "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[8], [x13], #0x1\n"
- "ld1 { v22.b }[8], [x12], #0x1\n"
- "ld1 { v20.b }[8], [x11], #0x1\n"
- "ld1 { v16.b }[8], [x10], #0x1\n"
- "ld1 { v19.b }[8], [x9], #0x1\n"
- "ld1 { v0.b }[8], [x28], #0x1\n"
- "ld1 { v18.b }[8], [x27], #0x1\n"
- "ld1 { v17.b }[8], [x26], #0x1\n"
- "b 18f\n"
- "14:" // Oddments: Load (B): Bit 3: Unset
- "tbz %x[n_channels], #2, 16f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "ld1 { v22.s }[0], [x12], #0x4\n"
- "ld1 { v20.s }[0], [x11], #0x4\n"
- "ld1 { v16.s }[0], [x10], #0x4\n"
- "ld1 { v19.s }[0], [x9], #0x4\n"
- "ld1 { v0.s }[0], [x28], #0x4\n"
- "ld1 { v18.s }[0], [x27], #0x4\n"
- "ld1 { v17.s }[0], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 15f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "ld1 { v22.h }[2], [x12], #0x2\n"
- "ld1 { v20.h }[2], [x11], #0x2\n"
- "ld1 { v16.h }[2], [x10], #0x2\n"
- "ld1 { v19.h }[2], [x9], #0x2\n"
- "ld1 { v0.h }[2], [x28], #0x2\n"
- "ld1 { v18.h }[2], [x27], #0x2\n"
- "ld1 { v17.h }[2], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[6], [x13], #0x1\n"
- "ld1 { v22.b }[6], [x12], #0x1\n"
- "ld1 { v20.b }[6], [x11], #0x1\n"
- "ld1 { v16.b }[6], [x10], #0x1\n"
- "ld1 { v19.b }[6], [x9], #0x1\n"
- "ld1 { v0.b }[6], [x28], #0x1\n"
- "ld1 { v18.b }[6], [x27], #0x1\n"
- "ld1 { v17.b }[6], [x26], #0x1\n"
- "b 18f\n"
- "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[4], [x13], #0x1\n"
- "ld1 { v22.b }[4], [x12], #0x1\n"
- "ld1 { v20.b }[4], [x11], #0x1\n"
- "ld1 { v16.b }[4], [x10], #0x1\n"
- "ld1 { v19.b }[4], [x9], #0x1\n"
- "ld1 { v0.b }[4], [x28], #0x1\n"
- "ld1 { v18.b }[4], [x27], #0x1\n"
- "ld1 { v17.b }[4], [x26], #0x1\n"
- "b 18f\n"
- "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
- "tbz %x[n_channels], #1, 17f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "ld1 { v22.h }[0], [x12], #0x2\n"
- "ld1 { v20.h }[0], [x11], #0x2\n"
- "ld1 { v16.h }[0], [x10], #0x2\n"
- "ld1 { v19.h }[0], [x9], #0x2\n"
- "ld1 { v0.h }[0], [x28], #0x2\n"
- "ld1 { v18.h }[0], [x27], #0x2\n"
- "ld1 { v17.h }[0], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[2], [x13], #0x1\n"
- "ld1 { v22.b }[2], [x12], #0x1\n"
- "ld1 { v20.b }[2], [x11], #0x1\n"
- "ld1 { v16.b }[2], [x10], #0x1\n"
- "ld1 { v19.b }[2], [x9], #0x1\n"
- "ld1 { v0.b }[2], [x28], #0x1\n"
- "ld1 { v18.b }[2], [x27], #0x1\n"
- "ld1 { v17.b }[2], [x26], #0x1\n"
- "b 18f\n"
- "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[0], [x13], #0x1\n"
- "ld1 { v22.b }[0], [x12], #0x1\n"
- "ld1 { v20.b }[0], [x11], #0x1\n"
- "ld1 { v16.b }[0], [x10], #0x1\n"
- "ld1 { v19.b }[0], [x9], #0x1\n"
- "ld1 { v0.b }[0], [x28], #0x1\n"
- "ld1 { v18.b }[0], [x27], #0x1\n"
- "ld1 { v17.b }[0], [x26], #0x1\n"
- "18:" // Oddments: Load (B): Bit 3: End
- "zip1 v7.16b, v27.16b, v25.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "cmp x19, #0x4\n"
- "zip2 v5.16b, v27.16b, v25.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "zip1 v8.16b, v1.16b, v23.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "zip2 v3.16b, v1.16b, v23.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "zip1 v2.16b, v31.16b, v21.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "zip2 v4.16b, v31.16b, v21.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "ld1 { v26.b }[2], [x12], #0x1\n"
+ "ld1 { v24.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v22.b }[2], [x26], #0x1\n"
+ "ld1 { v21.b }[2], [x25], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b29, [x15], #0x1\n"
+ "ldr b28, [x14], #0x1\n"
+ "ldr b27, [x13], #0x1\n"
+ "ldr b26, [x12], #0x1\n"
+ "ldr b24, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b22, [x26], #0x1\n"
+ "ldr b21, [x25], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q0, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "zip1 v2.16b, v3.16b, v1.16b\n"
+ "zip2 v5.16b, v9.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v8.16b, v6.16b\n"
+ "zip2 v6.16b, v8.16b, v6.16b\n"
+ "zip2 v1.16b, v3.16b, v1.16b\n"
+ "zip2 v3.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "zip2 v25.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v28.16b, v26.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n"
+ "zip2 v8.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v5.16b, v6.16b\n"
+ "zip2 v6.16b, v5.16b, v6.16b\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "zip2 v26.16b, v28.16b, v26.16b\n"
+ "zip2 v20.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v23.16b, v21.16b\n"
+ "zip2 v21.16b, v23.16b, v21.16b\n"
+ "zip2 v28.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v2.16b, v30.16b, v1.16b\n"
+ ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip1 v27.16b, v25.16b, v26.16b\n"
+ "zip2 v26.16b, v25.16b, v26.16b\n"
+ "zip2 v23.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v20.16b, v21.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n"
+ "zip2 v21.16b, v20.16b, v21.16b\n"
+ "mov v30.16b, v5.16b\n"
+ ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n"
+ ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n"
+ ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
"add %x[params], %x[params], #0x60\n"
- "zip1 v1.16b, v28.16b, v26.16b\n"
- "zip2 v31.16b, v28.16b, v26.16b\n"
- "zip1 v28.16b, v24.16b, v20.16b\n"
- "zip2 v26.16b, v24.16b, v20.16b\n"
- "zip1 v24.16b, v22.16b, v16.16b\n"
- "zip2 v22.16b, v22.16b, v16.16b\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "zip1 v18.16b, v0.16b, v17.16b\n"
- "zip2 v17.16b, v0.16b, v17.16b\n"
- "zip1 v6.16b, v7.16b, v8.16b\n"
- "zip2 v8.16b, v7.16b, v8.16b\n"
- "zip1 v7.16b, v5.16b, v3.16b\n"
- "str q7, [SP, #0x0]\n"
- "zip2 v5.16b, v5.16b, v3.16b\n"
- "str q5, [SP, #0x10]\n"
- "zip1 v3.16b, v2.16b, v1.16b\n"
- "zip2 v2.16b, v2.16b, v1.16b\n"
- "zip1 v1.16b, v4.16b, v31.16b\n"
- "str q1, [SP, #0x20]\n"
- "zip2 v16.16b, v4.16b, v31.16b\n"
- "str q16, [SP, #0x30]\n"
- "zip1 v31.16b, v28.16b, v24.16b\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "zip1 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x40]\n"
- "zip2 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x50]\n"
- "zip1 v26.16b, v20.16b, v18.16b\n"
- "zip2 v24.16b, v20.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x60]\n"
- "zip2 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x70]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- "movi v15.4s, #0x0\n"
- ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- "movi v10.4s, #0x0\n"
- ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v22.4s, v22.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 19f\n"
- "str s30, [x25, x23]\n"
- "str s22, [x24, x23]\n"
- "str s20, [x22, x23]\n"
- "str s19, [x21, x23]\n"
+ "blt 20f\n"
+ "str s5, [x24, x27]\n"
+ "str s30, [x23, x27]\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
"b 22f\n"
- "19:" // Oddments: Unroll 0: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 20f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 21f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 21f\n"
- "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "tbz x19, #0, 21f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End
-
- "22:" // Oddments: Unroll 0: After oddment store
- "add x23, x23, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "movi v15.4s, #0x0\n"
- "ldr q30, [%x[params], #0x0]\n"
- ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "cmp x19, #0x4\n"
- "movi v10.4s, #0x0\n"
- "ldr q27, [%x[params], #0x20]\n"
- "ldr q25, [%x[params], #0x30]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n"
+ ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n"
+ "movi v17.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n"
"ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n"
"ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n"
+ ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n"
+ ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n"
+ ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 23f\n"
- "str s30, [x25, x23]\n"
- "str s22, [x24, x23]\n"
- "str s20, [x22, x23]\n"
- "str s19, [x21, x23]\n"
+ "blt 24f\n"
+ "str s5, [x24, x27]\n"
+ "str s30, [x23, x27]\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
"b 26f\n"
- "23:" // Oddments: Unroll 1: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 24f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 25f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 25f\n"
- "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "tbz x19, #0, 25f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End
-
- "26:" // Oddments: Unroll 1: After oddment store
- "add x23, x23, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "movi v15.4s, #0x0\n"
- "ldr q6, [SP, #0x0]\n"
- "movi v10.4s, #0x0\n"
- "ldr q3, [SP, #0x20]\n"
- "cmp x19, #0x4\n"
- ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
- "ldr q31, [SP, #0x40]\n"
- "ldr q26, [SP, #0x60]\n"
- ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n"
+ ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n"
+ "movi v17.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n"
+ ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n"
+ ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n"
+ ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n"
+ ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n"
+ ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 27f\n"
- "str s30, [x25, x23]\n"
- "str s22, [x24, x23]\n"
- "str s20, [x22, x23]\n"
- "str s19, [x21, x23]\n"
+ "blt 28f\n"
+ "str s5, [x24, x27]\n"
+ "str s30, [x23, x27]\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
"b 30f\n"
- "27:" // Oddments: Unroll 2: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 28f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 29f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 29f\n"
- "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "tbz x19, #0, 29f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End
-
- "30:" // Oddments: Unroll 2: After oddment store
- "add x23, x23, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "movi v15.4s, #0x0\n"
- "ldr q8, [SP, #0x10]\n"
- "movi v10.4s, #0x0\n"
- "ldr q2, [SP, #0x30]\n"
- "ldr q28, [SP, #0x50]\n"
- ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
- "ldr q24, [SP, #0x70]\n"
- "ldr q30, [%x[params], #0x0]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "ldr q21, [%x[params], #0x50]\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n"
+ ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n"
+ "movi v17.4s, #0x0\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n"
+ ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n"
+ ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n"
+ ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n"
+ ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n"
+ ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n"
+ ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "31:" // Oddments: Unroll 3: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 32f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 33f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 33f\n"
- "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "tbz x19, #0, 33f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End
-
- "34:" // End
- "add SP, SP, #0x80\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 71729e0d1e..d69d0e1ef2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,1072 +91,1072 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x7, x6, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v24.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v22.16b }, [x24]\n"
- "ld1r { v12.16b }, [x23]\n"
- "lsr x16, x8, #0x3\n"
- "ld1r { v14.8h }, [x21]\n"
- "ld1r { v17.8h }, [x20]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "ld1r { v15.8h }, [x19]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x22, #0x0]\n"
- "ldp x28, x27, [x22, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v19.16b, v13.16b\n"
- "ldr q26, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v11.16b, v26.16b\n"
- "mov v18.16b, v13.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v24.16b, v26.16b\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.8h }, [x21]\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "mov x8, #0x0\n"
+ "mov x17, #0x0\n"
+ "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x12, x11, [x22, #0x0]\n"
+ "ldp x10, x9, [x22, #0x10]\n"
+ "cbz x7, 3f\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
+ "subs x7, x7, #0x1\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "ssubl v5.8h, v5.8b, v15.8b\n"
+ "ssubl v6.8h, v6.8b, v15.8b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v8.8h, v8.8b, v15.8b\n"
+ "ldr q13, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "str x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
"mov v9.16b, v13.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v26.16b\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ldr x19, [x12, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "ssubl v27.8h, v27.8b, v22.8b\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d31, [x24, x8]\n"
+ "ldr d30, [x23, x8]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d29, [x22, x8]\n"
+ "ldr d28, [x21, x8]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ldr d27, [x20, x8]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "ssubl v27.8h, v27.8b, v24.8b\n"
"beq 2f\n"
"1:" // Loop
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q22, [x13, #0x0]\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr x25, [x12, #0x40]\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr q23, [x14, #0x10]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr x21, [x16, #0x28]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr x19, [x12, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x23, [x12, #0x58]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x10]\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "ldr d29, [x20, x8]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x8]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "ldr x21, [x12, #0x68]\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "ldr x19, [x12, #0x78]\n"
- "ldr q21, [x13, #0x0]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x28, x8]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "ldr q25, [x11, #0x0]\n"
- "ldr q10, [x13, #0x10]\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "ldr q16, [x11, #0x10]\n"
- "add x17, x17, #0x48\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "subs x16, x16, #0x1\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "add x15, x15, #0x48\n"
+ "subs x7, x7, #0x1\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x27, x8]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "add x14, x14, #0x20\n"
"add x13, x13, #0x20\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "add x11, x11, #0x20\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x26, x8]\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "ldr d28, [x24, x8]\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x8]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x23, x8]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x8]\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x8]\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x20, x8]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "add x8, x8, #0x8\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d13, [x10, x14]\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d13, [x12, x17]\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d19, [x9, x14]\n"
- "str d18, [x28, x14]\n"
- "str d9, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q26, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v19.16b, v13.16b\n"
- "mov v11.16b, v26.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v18.16b, v13.16b\n"
- "mov v24.16b, v26.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d9, [x11, x17]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str d16, [x10, x17]\n"
+ "str d25, [x9, x17]\n"
+ "ldr q13, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
+ "add x17, x17, #0x8\n"
+ "str x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
"mov v9.16b, v13.16b\n"
- "mov v23.16b, v26.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr x19, [x12, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "ssubl v27.8h, v27.8b, v22.8b\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "ldr d31, [x24, x8]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d30, [x23, x8]\n"
+ "ldr d29, [x22, x8]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v5.8h, v5.8b, v15.8b\n"
+ "ldr d28, [x21, x8]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d27, [x20, x8]\n"
+ "ssubl v8.8h, v8.8b, v15.8b\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "ssubl v27.8h, v27.8b, v24.8b\n"
"bgt 1b\n"
"2:" // Tail
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q22, [x13, #0x0]\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr x25, [x12, #0x40]\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr q23, [x14, #0x10]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr x21, [x16, #0x28]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr x19, [x12, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x23, [x12, #0x58]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x10]\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "ldr d29, [x20, x8]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x8]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "ldr x21, [x12, #0x68]\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "ldr x19, [x12, #0x78]\n"
- "ldr q21, [x13, #0x0]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x28, x8]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "ldr q25, [x11, #0x0]\n"
- "ldr q10, [x13, #0x10]\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "ldr q16, [x11, #0x10]\n"
- "tst x8, #0x7\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "tst x6, #0x7\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "add x14, x14, #0x20\n"
"add x13, x13, #0x20\n"
- "add x11, x11, #0x20\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x27, x8]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x26, x8]\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "ldr d28, [x24, x8]\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x8]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x23, x8]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x8]\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x8]\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x20, x8]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "add x8, x8, #0x8\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d13, [x10, x14]\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d13, [x12, x17]\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d19, [x9, x14]\n"
- "str d18, [x28, x14]\n"
- "str d9, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d9, [x11, x17]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str d16, [x10, x17]\n"
+ "str d25, [x9, x17]\n"
+ "add x17, x17, #0x8\n"
"beq 64f\n"
- "add x17, x17, #0x48\n"
+ "add x15, x15, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v13.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v26.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v26.s }[2], [x19]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x6, #2, 5f\n"
+ "ld1 { v13.4s }, [x28], #0x10\n"
+ "tbz x6, #1, 4f\n"
+ "ld1 { v20.d }[0], [x28], #0x8\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v20.s }[2], [x28]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v26.s }[0], [x19]\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v20.s }[0], [x28]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "tbz x6, #1, 6f\n"
+ "ld1 { v13.d }[0], [x28], #0x8\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v13.s }[2], [x28]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v13.s }[0], [x19]\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v13.s }[0], [x28]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "mov v19.16b, v13.16b\n"
- "mov v11.16b, v26.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v18.16b, v13.16b\n"
- "mov v24.16b, v26.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
"mov v9.16b, v13.16b\n"
- "mov v23.16b, v26.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ldr x19, [x12, #0x20]\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x23]\n"
- "ld1 { v30.b }[6], [x22]\n"
- "ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v5.8h, v5.8b, v15.8b\n"
+ "ssubl v6.8h, v6.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v8.8h, v8.8b, v15.8b\n"
+ "add x24, x24, x8\n"
+ "add x23, x23, x8\n"
+ "add x22, x22, x8\n"
+ "add x21, x21, x8\n"
+ "add x20, x20, x8\n"
+ "tbz x6, #2, 9f\n"
+ "ld1 { v31.s }[0], [x24], #0x4\n"
+ "ld1 { v30.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x6, #1, 8f\n"
+ "ld1 { v31.h }[2], [x24], #0x2\n"
+ "ld1 { v30.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[6], [x24]\n"
+ "ld1 { v30.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x23]\n"
- "ld1 { v30.b }[4], [x22]\n"
- "ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[4], [x24]\n"
+ "ld1 { v30.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x23]\n"
- "ld1 { v30.b }[2], [x22]\n"
- "ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x6, #1, 10f\n"
+ "ld1 { v31.h }[0], [x24], #0x2\n"
+ "ld1 { v30.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[2], [x24]\n"
+ "ld1 { v30.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x23]\n"
- "ld1 { v30.b }[0], [x22]\n"
- "ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[0], [x24]\n"
+ "ld1 { v30.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v22.8b\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "add x21, x21, x15\n"
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "add x21, x21, x8\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ssubl v27.8h, v27.8b, v22.8b\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ssubl v27.8h, v27.8b, v24.8b\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "tbz x8, #2, 13f\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "tbz x6, #2, 13f\n"
"ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 12f\n"
+ "tbz x6, #1, 12f\n"
"ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[6], [x21]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[4], [x21]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 14f\n"
+ "tbz x6, #1, 14f\n"
"ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[2], [x21]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[0], [x21]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr x20, [x12, #0x30]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x16, #0x30]\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "tbz x8, #2, 17f\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x8\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "tbz x6, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 16f\n"
+ "tbz x6, #1, 16f\n"
"ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 18f\n"
+ "tbz x6, #1, 18f\n"
"ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "add x26, x26, x15\n"
- "tbz x8, #2, 21f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x8, #1, 20f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "add x28, x28, x8\n"
+ "tbz x6, #2, 21f\n"
+ "ld1 { v28.s }[0], [x28], #0x4\n"
+ "tbz x6, #1, 20f\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[6], [x28]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[4], [x28]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x8, #1, 22f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x6, #1, 22f\n"
+ "ld1 { v28.h }[0], [x28], #0x2\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[2], [x28]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[0], [x28]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "ldr x25, [x12, #0x40]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "ldr x27, [x16, #0x40]\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "add x25, x25, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v31.s }[0], [x25], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v31.h }[2], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[6], [x25]\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "add x27, x27, x8\n"
+ "tbz x6, #2, 25f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x6, #1, 24f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[6], [x27]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[4], [x25]\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[4], [x27]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v31.h }[0], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[2], [x25]\n"
+ "tbz x6, #1, 26f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[2], [x27]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[0], [x25]\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[0], [x27]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "ldr x19, [x12, #0x48]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "ldr x26, [x16, #0x48]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 29f\n"
- "ld1 { v30.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 28f\n"
- "ld1 { v30.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[6], [x19]\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "add x26, x26, x8\n"
+ "tbz x6, #2, 29f\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "tbz x6, #1, 28f\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[6], [x26]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[4], [x19]\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[4], [x26]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x8, #1, 30f\n"
- "ld1 { v30.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[2], [x19]\n"
+ "tbz x6, #1, 30f\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[2], [x26]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[0], [x19]\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[0], [x26]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "ldr x24, [x12, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "add x24, x24, x15\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "add x25, x25, x8\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "tbz x6, #2, 33f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x6, #1, 32f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x6, #1, 34f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "ldr x23, [x12, #0x58]\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x24, [x16, #0x58]\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "add x24, x24, x8\n"
+ "tbz x6, #2, 37f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x6, #1, 36f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x6, #1, 38f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "add x23, x23, x8\n"
+ "tbz x6, #2, 41f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "tbz x6, #1, 40f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x6, #1, 42f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[0], [x23]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v22.8b\n"
- "ldr x21, [x12, #0x68]\n"
+ "ssubl v31.8h, v31.8b, v24.8b\n"
+ "ldr x22, [x16, #0x68]\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "add x22, x22, x8\n"
+ "tbz x6, #2, 45f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x6, #1, 44f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x6, #1, 46f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v30.8h, v30.8b, v22.8b\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ssubl v30.8h, v30.8b, v24.8b\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "add x21, x21, x8\n"
+ "tbz x6, #2, 49f\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "tbz x6, #1, 48f\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x6, #1, 50f\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v29.8h, v29.8b, v22.8b\n"
- "ldr x19, [x12, #0x78]\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v28.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v28.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[6], [x19]\n"
+ "ssubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x20, [x16, #0x78]\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "add x20, x20, x8\n"
+ "tbz x6, #2, 53f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x6, #1, 52f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[4], [x19]\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v28.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[2], [x19]\n"
+ "tbz x6, #1, 54f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[0], [x19]\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v21.4s }, [x13], #0x10\n"
- "ld1 { v25.4s }, [x11], #0x10\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v10.d }[0], [x13], #0x8\n"
- "ld1 { v16.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v16.s }[2], [x11]\n"
+ "ssubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "tbz x6, #2, 57f\n"
+ "ld1 { v17.4s }, [x14], #0x10\n"
+ "ld1 { v22.4s }, [x13], #0x10\n"
+ "tbz x6, #1, 56f\n"
+ "ld1 { v23.d }[0], [x14], #0x8\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v23.s }[2], [x14]\n"
+ "ld1 { v19.s }[2], [x13]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v10.s }[0], [x13]\n"
- "ld1 { v16.s }[0], [x11]\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v23.s }[0], [x14]\n"
+ "ld1 { v19.s }[0], [x13]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v21.d }[0], [x13], #0x8\n"
- "ld1 { v25.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[2], [x13]\n"
- "ld1 { v25.s }[2], [x11]\n"
+ "tbz x6, #1, 58f\n"
+ "ld1 { v17.d }[0], [x14], #0x8\n"
+ "ld1 { v22.d }[0], [x13], #0x8\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v17.s }[2], [x14]\n"
+ "ld1 { v22.s }[2], [x13]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[0], [x13]\n"
- "ld1 { v25.s }[0], [x11]\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v17.s }[0], [x14]\n"
+ "ld1 { v22.s }[0], [x13]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "add x12, x12, x17\n"
+ "add x11, x11, x17\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "add x10, x10, x17\n"
+ "add x9, x9, x17\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "tbz x8, #2, 61f\n"
- "st1 { v13.s }[0], [x10], #0x4\n"
- "st1 { v19.s }[0], [x9], #0x4\n"
- "st1 { v18.s }[0], [x28], #0x4\n"
- "st1 { v9.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 60f\n"
- "st1 { v13.h }[2], [x10], #0x2\n"
- "st1 { v19.h }[2], [x9], #0x2\n"
- "st1 { v18.h }[2], [x28], #0x2\n"
- "st1 { v9.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[6], [x10], #0x1\n"
- "st1 { v19.b }[6], [x9], #0x1\n"
- "st1 { v18.b }[6], [x28], #0x1\n"
- "st1 { v9.b }[6], [x27], #0x1\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "tbz x6, #2, 61f\n"
+ "st1 { v13.s }[0], [x12], #0x4\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v16.s }[0], [x10], #0x4\n"
+ "st1 { v25.s }[0], [x9], #0x4\n"
+ "tbz x6, #1, 60f\n"
+ "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v16.h }[2], [x10], #0x2\n"
+ "st1 { v25.h }[2], [x9], #0x2\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v16.b }[6], [x10], #0x1\n"
+ "st1 { v25.b }[6], [x9], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[4], [x10], #0x1\n"
- "st1 { v19.b }[4], [x9], #0x1\n"
- "st1 { v18.b }[4], [x28], #0x1\n"
- "st1 { v9.b }[4], [x27], #0x1\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[4], [x12], #0x1\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v16.b }[4], [x10], #0x1\n"
+ "st1 { v25.b }[4], [x9], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "st1 { v13.h }[0], [x10], #0x2\n"
- "st1 { v19.h }[0], [x9], #0x2\n"
- "st1 { v18.h }[0], [x28], #0x2\n"
- "st1 { v9.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[2], [x10], #0x1\n"
- "st1 { v19.b }[2], [x9], #0x1\n"
- "st1 { v18.b }[2], [x28], #0x1\n"
- "st1 { v9.b }[2], [x27], #0x1\n"
+ "tbz x6, #1, 62f\n"
+ "st1 { v13.h }[0], [x12], #0x2\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v16.h }[0], [x10], #0x2\n"
+ "st1 { v25.h }[0], [x9], #0x2\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v16.b }[2], [x10], #0x1\n"
+ "st1 { v25.b }[2], [x9], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[0], [x10], #0x1\n"
- "st1 { v19.b }[0], [x9], #0x1\n"
- "st1 { v18.b }[0], [x28], #0x1\n"
- "st1 { v9.b }[0], [x27], #0x1\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[0], [x12], #0x1\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v16.b }[0], [x10], #0x1\n"
+ "st1 { v25.b }[0], [x9], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 0dc377c5c1..fa9ae97dee 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,75 +100,75 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.16b }, [x24]\n"
- "ld1r { v13.16b }, [x23]\n"
- "lsr x16, x8, #0x3\n"
- "ld1r { v11.8h }, [x21]\n"
- "ld1r { v17.8h }, [x20]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "ld1r { v14.8h }, [x19]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v16.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x22, #0x0]\n"
- "ldp x28, x27, [x22, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v9.16b, v15.16b\n"
- "ldr q10, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v16.16b, v10.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v21.16b, v10.16b\n"
- "mov v23.16b, v15.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v18.16b, v10.16b\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
- "ldp x26, x25, [x12, #0x0]\n"
- "ldp x24, x23, [x12, #0x10]\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ldp x22, x21, [x12, #0x20]\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "ldr q15, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d31, [x27, x17]\n"
+ "ldr d30, [x26, x17]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr d28, [x24, x17]\n"
"ssubl v31.8h, v31.8b, v12.8b\n"
"ssubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d27, [x23, x17]\n"
+ "ldr d26, [x22, x17]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr d24, [x20, x17]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
@@ -176,250 +176,250 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x19, [x12, #0x58]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x22, [x12, #0x78]\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x0]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "ldr x21, [x12, #0x80]\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x21, [x15, #0x80]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "ldr x20, [x12, #0x88]\n"
- "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x12, #0x70]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr x24, [x12, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x25, [x15, #0x98]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "ldr x22, [x12, #0xa8]\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x21, x17]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "ldr x21, [x12, #0xb0]\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x12, #0xc0]\n"
- "ldr q19, [x13, #0x0]\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x24, x17]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q4, [x13, #0x10]\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "ldr q31, [x11, #0x10]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
"ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x14, x14, #0x48\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "add x17, x17, #0x48\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
- "subs x16, x16, #0x1\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x15, x15, #0x8\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "add x11, x11, #0x20\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
+ "str d15, [x11, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str d9, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d22, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q10, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
+ "str d10, [x10, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str d9, [x9, x16]\n"
+ "str d21, [x28, x16]\n"
+ "ldr q15, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"mov v9.16b, v15.16b\n"
- "mov v16.16b, v10.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v22.16b, v15.16b\n"
- "mov v21.16b, v10.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v15.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "ldp x26, x25, [x12, #0x0]\n"
- "ldp x24, x23, [x12, #0x10]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x22, x21, [x12, #0x20]\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d31, [x27, x17]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
+ "ldr d30, [x26, x17]\n"
+ "ldr d29, [x25, x17]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "ldr d28, [x24, x17]\n"
+ "ldr d27, [x23, x17]\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
"ssubl v31.8h, v31.8b, v12.8b\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr d25, [x21, x17]\n"
"ssubl v30.8h, v30.8b, v12.8b\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d24, [x20, x17]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
@@ -428,966 +428,966 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x19, [x12, #0x58]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x22, [x12, #0x78]\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x0]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "ldr x21, [x12, #0x80]\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x21, [x15, #0x80]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "ldr x20, [x12, #0x88]\n"
- "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x12, #0x70]\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr x24, [x12, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x25, [x15, #0x98]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "ldr x22, [x12, #0xa8]\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x21, x17]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "ldr x21, [x12, #0xb0]\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x12, #0xc0]\n"
- "ldr q19, [x13, #0x0]\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x24, x17]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q4, [x13, #0x10]\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "ldr q31, [x11, #0x10]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "tst x7, #0x7\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
"ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "tst x8, #0x7\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
"add x13, x13, #0x20\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x11, x11, #0x20\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x15, x15, #0x8\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
+ "str d15, [x11, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str d9, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d22, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "str d10, [x10, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str d9, [x9, x16]\n"
+ "str d21, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 88f\n"
- "add x17, x17, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v15.4s }, [x24], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v17.d }[0], [x24], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v10.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v17.s }[0], [x24]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v15.d }[0], [x24], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[0], [x24]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"mov v9.16b, v15.16b\n"
- "mov v16.16b, v10.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v22.16b, v15.16b\n"
- "mov v21.16b, v10.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
- "mov v23.16b, v15.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x26, x25, [x12, #0x0]\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x24, x23, [x12, #0x10]\n"
- "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "add x26, x26, x15\n"
- "add x25, x25, x15\n"
- "add x24, x24, x15\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x26], #0x4\n"
- "ld1 { v30.s }[0], [x25], #0x4\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v30.b }[6], [x25]\n"
- "ld1 { v29.b }[6], [x24]\n"
- "ld1 { v28.b }[6], [x23]\n"
- "ld1 { v27.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v30.b }[4], [x25]\n"
- "ld1 { v29.b }[4], [x24]\n"
- "ld1 { v28.b }[4], [x23]\n"
- "ld1 { v27.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x26], #0x2\n"
- "ld1 { v30.h }[0], [x25], #0x2\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v30.b }[2], [x25]\n"
- "ld1 { v29.b }[2], [x24]\n"
- "ld1 { v28.b }[2], [x23]\n"
- "ld1 { v27.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x26]\n"
- "ld1 { v30.b }[0], [x25]\n"
- "ld1 { v29.b }[0], [x24]\n"
- "ld1 { v28.b }[0], [x23]\n"
- "ld1 { v27.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v25.b }[0], [x20]\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"ssubl v31.8h, v31.8b, v12.8b\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
"ssubl v30.8h, v30.8b, v12.8b\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "add x24, x24, x15\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "add x24, x24, x17\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
"ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "tbz x8, #2, 13f\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "tbz x7, #2, 13f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 12f\n"
+ "tbz x7, #1, 12f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 14f\n"
+ "tbz x7, #1, 14f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[0], [x24]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 17f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 16f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x8, #1, 18f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x12, #0x50]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 21f\n"
+ "ldr x21, [x15, #0x50]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 21f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 20f\n"
+ "tbz x7, #1, 20f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x8, #1, 22f\n"
+ "tbz x7, #1, 22f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x19, [x12, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v26.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v26.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[6], [x19]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[4], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v26.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[2], [x19]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[0], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
"ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 29f\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 28f\n"
+ "tbz x7, #1, 28f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 30f\n"
+ "tbz x7, #1, 30f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
"ssubl v25.8h, v25.8b, v12.8b\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
"ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x19, [x12, #0x70]\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[0], [x21]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
"ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x12, #0x78]\n"
+ "ldr x23, [x15, #0x78]\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
"ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x12, #0x80]\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
+ "ldr x21, [x15, #0x80]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 45f\n"
"ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
+ "tbz x7, #1, 44f\n"
"ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
+ "tbz x7, #1, 46f\n"
"ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
"ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr x20, [x12, #0x88]\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
"ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[6], [x23]\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "add x24, x24, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v25.s }[0], [x24], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[6], [x24]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[4], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[4], [x24]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[2], [x23]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v25.h }[0], [x24], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[2], [x24]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[0], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[0], [x24]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
"ssubl v25.8h, v25.8b, v12.8b\n"
- "ldr x24, [x12, #0x98]\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "add x24, x24, x15\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "add x25, x25, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
"ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 61f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 60f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x7, #1, 62f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
"ssubl v27.8h, v27.8b, v12.8b\n"
- "ldr x22, [x12, #0xa8]\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 65f\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 64f\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[6], [x22]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[6], [x23]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[4], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[4], [x23]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 66f\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[2], [x22]\n"
+ "tbz x7, #1, 66f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[2], [x23]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[0], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[0], [x23]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
"ssubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x12, #0xb0]\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 69f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 68f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x8, #1, 70f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x7, #1, 70f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
"ssubl v26.8h, v26.8b, v12.8b\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 73f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 72f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[6], [x21]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[4], [x21]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x8, #1, 74f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x7, #1, 74f\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[2], [x21]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[0], [x21]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
"ssubl v25.8h, v25.8b, v12.8b\n"
- "ldr x19, [x12, #0xc0]\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 77f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 76f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x8, #1, 78f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 78f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "tbz x8, #2, 81f\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "tbz x7, #2, 81f\n"
"ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v0.4s }, [x11], #0x10\n"
- "tbz x8, #1, 80f\n"
- "ld1 { v4.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
- "ld1 { v4.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x11]\n"
+ "ld1 { v18.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v30.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 83f\n"
- "ld1 { v4.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x11]\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 82f\n"
+ "tbz x7, #1, 82f\n"
"ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v0.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
+ "ld1 { v18.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
"ld1 { v19.s }[2], [x13]\n"
- "ld1 { v0.s }[2], [x11]\n"
+ "ld1 { v18.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 83f\n"
+ "tbz x7, #0, 83f\n"
"ld1 { v19.s }[0], [x13]\n"
- "ld1 { v0.s }[0], [x11]\n"
+ "ld1 { v18.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
"sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "tbz x8, #2, 85f\n"
- "st1 { v15.s }[0], [x10], #0x4\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v15.s }[0], [x11], #0x4\n"
+ "st1 { v10.s }[0], [x10], #0x4\n"
"st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v22.s }[0], [x28], #0x4\n"
- "st1 { v23.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 84f\n"
- "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v21.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "st1 { v10.h }[2], [x10], #0x2\n"
"st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v22.h }[2], [x28], #0x2\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v21.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[6], [x11], #0x1\n"
+ "st1 { v10.b }[6], [x10], #0x1\n"
"st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v22.b }[6], [x28], #0x1\n"
- "st1 { v23.b }[6], [x27], #0x1\n"
+ "st1 { v21.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[4], [x10], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[4], [x11], #0x1\n"
+ "st1 { v10.b }[4], [x10], #0x1\n"
"st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v22.b }[4], [x28], #0x1\n"
- "st1 { v23.b }[4], [x27], #0x1\n"
+ "st1 { v21.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 86f\n"
- "st1 { v15.h }[0], [x10], #0x2\n"
+ "tbz x7, #1, 86f\n"
+ "st1 { v15.h }[0], [x11], #0x2\n"
+ "st1 { v10.h }[0], [x10], #0x2\n"
"st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v22.h }[0], [x28], #0x2\n"
- "st1 { v23.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v21.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[2], [x11], #0x1\n"
+ "st1 { v10.b }[2], [x10], #0x1\n"
"st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v22.b }[2], [x28], #0x1\n"
- "st1 { v23.b }[2], [x27], #0x1\n"
+ "st1 { v21.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[0], [x10], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[0], [x11], #0x1\n"
+ "st1 { v10.b }[0], [x10], #0x1\n"
"st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v22.b }[0], [x28], #0x1\n"
- "st1 { v23.b }[0], [x27], #0x1\n"
+ "st1 { v21.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 663ea59a98..4b0ad00187 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,2073 +111,2073 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x10, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x17, x10, %[offsetof_Requantize32_a_offset]\n"
- "add x9, x10, %[offsetof_Requantize32_b_offset]\n"
- "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x4, x10, %[offsetof_Requantize32_c_offset]\n"
- "add x14, x10, %[offsetof_Requantize32_minval]\n"
- "ldr x23, [%x[params], %[offsetof_Params_weights]]\n"
- "add x5, x10, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v9.16b }, [x17]\n"
- "ld1r { v14.16b }, [x9]\n"
- "lsr x3, x0, #0x3\n"
- "ld1r { v18.8h }, [x4]\n"
- "ld1r { v11.8h }, [x14]\n"
- "mov x24, #0x0\n"
- "mov x22, #0x0\n"
- "ld1r { v13.8h }, [x5]\n"
- "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x20, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x16, x8, [x25, #0x0]\n"
- "ldp x4, x7, [x25, #0x10]\n"
- "cbz x3, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x3, x3, #0x1\n"
- "mov v17.16b, v15.16b\n"
- "ldr q16, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "ldr d2, [x23, #0x10]\n"
- "mov v8.16b, v16.16b\n"
- "mov v10.16b, v15.16b\n"
- "ldr d3, [x23, #0x18]\n"
- "ldr d4, [x23, #0x20]\n"
- "mov v7.16b, v16.16b\n"
- "mov v6.16b, v15.16b\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "mov v5.16b, v16.16b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ldr d31, [x28, x24]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr d30, [x6, x24]\n"
- "ldr d29, [x26, x24]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v9.16b }, [x3]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
+ "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x11]\n"
+ "ld1r { v14.8h }, [x5]\n"
+ "add x3, x13, %[offsetof_Requantize32_minval]\n"
+ "add x15, x13, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.8h }, [x3]\n"
+ "ld1r { v11.8h }, [x15]\n"
+ "mov x0, #0x0\n"
+ "mov x10, #0x0\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x6, [x24, #0x0]\n"
+ "ldp x7, x16, [x24, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q13, [x13, #0x0]\n"
+ "ldr q19, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d31, [x9, x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d30, [x28, x0]\n"
+ "ldr d29, [x27, x0]\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x25, x24]\n"
- "ldr d27, [x5, x24]\n"
+ "ldr d28, [x26, x0]\n"
+ "ldr d27, [x25, x0]\n"
"ssubl v29.8h, v29.8b, v9.8b\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x2, x24]\n"
- "ldr d25, [x27, x24]\n"
+ "ldr d23, [x24, x0]\n"
+ "ldr d25, [x23, x0]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x21, x24]\n"
- "ldr d26, [x12, x24]\n"
+ "ldr d24, [x22, x0]\n"
+ "ldr d26, [x21, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x19, x24]\n"
+ "ldr d22, [x20, x0]\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
"ssubl v22.8h, v22.8b, v9.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr x19, [x20, #0x50]\n"
- "ldr d31, [x19, x24]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
- "ldr x15, [x20, #0x58]\n"
+ "ldr q18, [x5, #0x0]\n"
+ "ldr q6, [x8, #0x0]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr q5, [x5, #0x10]\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x0]\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "ldr x19, [x20, #0x60]\n"
- "ldr x27, [x20, #0x68]\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x5, [x20, #0x70]\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x15, x24]\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x0]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "ldr d0, [x23, #0x28]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "ldr x12, [x20, #0x80]\n"
- "ldr x26, [x20, #0x88]\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x4, #0x78]\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x14, [x20, #0x90]\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "ldr x24, [x4, #0x88]\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x0]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
- "ldr d1, [x23, #0x30]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "ldr x21, [x20, #0xa0]\n"
- "ldr x2, [x20, #0xa8]\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x13, [x20, #0xb0]\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x27, x24]\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
- "ldr d2, [x23, #0x38]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "ldr x19, [x20, #0xc0]\n"
- "ldr x28, [x20, #0xc8]\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x6, [x20, #0xd0]\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x5, x24]\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x26, x0]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
- "ldr d3, [x23, #0x40]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "ldr d27, [x11, x24]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x25, x0]\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "ldr x11, [x20, #0xe0]\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0x48]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
- "ldr x5, [x20, #0xf0]\n"
- "ldr q12, [x10, #0x0]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "ldr q19, [x1, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "ldr q29, [x1, #0x10]\n"
- "subs x3, x3, #0x1\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d28, [x26, x24]\n"
- "ldr d0, [x23, #0x50]\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x0]\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "add x10, x10, #0x20\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "add x1, x1, #0x20\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "ldr d23, [x12, x24]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x0]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
- "ldr d1, [x23, #0x58]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d31, [x14, x24]\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x0]\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "ldr d2, [x23, #0x60]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "ldr d30, [x15, x24]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x0]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
- "ldr d3, [x23, #0x68]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "ldr d26, [x21, x24]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x0]\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x0]\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "ldr x14, [x20, #0x110]\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x23, #0x70]\n"
- "ldr d22, [x9, x24]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
"ssubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "ldr d25, [x2, x24]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
- "ldr d0, [x23, #0x78]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "ldr d24, [x13, x24]\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
- "ldr d1, [x23, #0x80]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x0]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "ldr d2, [x23, #0x88]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "ldr d23, [x28, x24]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x22, x0]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "ldr d3, [x23, #0x90]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "ldr d28, [x11, x24]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x9, x0]\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x0]\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr d31, [x6, x24]\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
- "ldr d4, [x23, #0x98]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x24]\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x0]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr d0, [x23, #0xa0]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d26, [x17, x24]\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x0]\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr d1, [x23, #0xa8]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d25, [x5, x24]\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr d2, [x23, #0xb0]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "ldr d24, [x25, x24]\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr d3, [x23, #0xb8]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x0]\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "ldr d27, [x26, x24]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0xc0]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "add x23, x23, #0xc8\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "ldr q22, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "add x3, x3, #0xc8\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x12, x24]\n"
+ "ldr d25, [x15, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x14, x24]\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ldr d24, [x21, x0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x24]\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ldr d27, [x20, x0]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x24, x24, #0x8\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "add x0, x0, #0x8\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d13, [x17, x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x16, x22]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d17, [x8, x22]\n"
- "str d10, [x4, x22]\n"
- "str d6, [x7, x22]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x22, x22, #0x8\n"
- "ldr q16, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "ldr d2, [x23, #0x10]\n"
- "mov v17.16b, v15.16b\n"
- "mov v8.16b, v16.16b\n"
- "ldr d3, [x23, #0x18]\n"
- "ldr d4, [x23, #0x20]\n"
- "mov v10.16b, v15.16b\n"
- "mov v7.16b, v16.16b\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "mov v6.16b, v15.16b\n"
- "mov v5.16b, v16.16b\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ldr d31, [x28, x24]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr d30, [x6, x24]\n"
- "ldr d29, [x26, x24]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
+ "str d20, [x6, x10]\n"
+ "str d8, [x7, x10]\n"
+ "str d17, [x16, x10]\n"
+ "ldr q13, [x13, #0x0]\n"
+ "ldr q19, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "add x10, x10, #0x8\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d31, [x9, x0]\n"
+ "ldr d30, [x28, x0]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d29, [x27, x0]\n"
+ "ldr d28, [x26, x0]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr d28, [x25, x24]\n"
- "ldr d27, [x5, x24]\n"
+ "ldr d27, [x25, x0]\n"
+ "ldr d23, [x24, x0]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
"ssubl v29.8h, v29.8b, v9.8b\n"
- "ldr d23, [x2, x24]\n"
- "ldr d25, [x27, x24]\n"
+ "ldr d25, [x23, x0]\n"
+ "ldr d24, [x22, x0]\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr d24, [x21, x24]\n"
- "ldr d26, [x12, x24]\n"
+ "ldr d26, [x21, x0]\n"
+ "ldr d22, [x20, x0]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "ldr d22, [x19, x24]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
"ssubl v22.8h, v22.8b, v9.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr x19, [x20, #0x50]\n"
- "ldr d31, [x19, x24]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
- "ldr x15, [x20, #0x58]\n"
+ "ldr q18, [x5, #0x0]\n"
+ "ldr q6, [x8, #0x0]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr q5, [x5, #0x10]\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x0]\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "ldr x19, [x20, #0x60]\n"
- "ldr x27, [x20, #0x68]\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x5, [x20, #0x70]\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x15, x24]\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x0]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "ldr d0, [x23, #0x28]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "ldr x12, [x20, #0x80]\n"
- "ldr x26, [x20, #0x88]\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x4, #0x78]\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x14, [x20, #0x90]\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "ldr x24, [x4, #0x88]\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x0]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
- "ldr d1, [x23, #0x30]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "ldr x21, [x20, #0xa0]\n"
- "ldr x2, [x20, #0xa8]\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x13, [x20, #0xb0]\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x27, x24]\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
- "ldr d2, [x23, #0x38]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "ldr x19, [x20, #0xc0]\n"
- "ldr x28, [x20, #0xc8]\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x6, [x20, #0xd0]\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x5, x24]\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x26, x0]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
- "ldr d3, [x23, #0x40]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "ldr d27, [x11, x24]\n"
- "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x25, x0]\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "ldr x11, [x20, #0xe0]\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0x48]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
- "ldr x5, [x20, #0xf0]\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "ldr q12, [x10, #0x0]\n"
- "ldr q19, [x1, #0x0]\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "ldr q20, [x10, #0x10]\n"
- "ldr q29, [x1, #0x10]\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d28, [x26, x24]\n"
- "ldr d0, [x23, #0x50]\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x0]\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "tst x0, #0x7\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "add x10, x10, #0x20\n"
- "add x1, x1, #0x20\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "ldr d23, [x12, x24]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "tst x1, #0x7\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x0]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
- "ldr d1, [x23, #0x58]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d31, [x14, x24]\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x0]\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "ldr d2, [x23, #0x60]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "ldr x14, [x20, #0x110]\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "ldr d30, [x15, x24]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x0]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
- "ldr d3, [x23, #0x68]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "ldr d26, [x21, x24]\n"
- "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x0]\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x0]\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x23, #0x70]\n"
- "ldr d22, [x9, x24]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
"ssubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "ldr d25, [x2, x24]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
- "ldr d0, [x23, #0x78]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "ldr d24, [x13, x24]\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
- "ldr d1, [x23, #0x80]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x0]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "ldr d2, [x23, #0x88]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "ldr d23, [x28, x24]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x22, x0]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "ldr d3, [x23, #0x90]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "ldr d28, [x11, x24]\n"
- "ssubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x9, x0]\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x0]\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr d31, [x6, x24]\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
- "ldr d4, [x23, #0x98]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x24]\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x0]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr d0, [x23, #0xa0]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d26, [x17, x24]\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x0]\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr d1, [x23, #0xa8]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d25, [x5, x24]\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr d2, [x23, #0xb0]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "ldr d24, [x25, x24]\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr d3, [x23, #0xb8]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x0]\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "ldr d27, [x26, x24]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0xc0]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "ldr q22, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x12, x24]\n"
+ "ldr d25, [x15, x0]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x14, x24]\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ldr d24, [x21, x0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x24]\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ldr d27, [x20, x0]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x24, x24, #0x8\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "add x0, x0, #0x8\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d13, [x17, x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x16, x22]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d17, [x8, x22]\n"
- "str d10, [x4, x22]\n"
- "str d6, [x7, x22]\n"
- "add x22, x22, #0x8\n"
+ "str d20, [x6, x10]\n"
+ "str d8, [x7, x10]\n"
+ "str d17, [x16, x10]\n"
+ "add x10, x10, #0x8\n"
"beq 124f\n"
- "add x23, x23, #0xc8\n"
+ "add x3, x3, #0xc8\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x0, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x0, #1, 4f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
- "tbz x0, #0, 7f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v19.s }[2], [x13]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x0, #0, 7f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v19.s }[0], [x13]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x0, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x0, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x1, #1, 6f\n"
+ "ld1 { v13.d }[0], [x13], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v13.s }[2], [x13]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v13.s }[0], [x13]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "mov v17.16b, v15.16b\n"
- "mov v8.16b, v16.16b\n"
- "ldr d2, [x23, #0x10]\n"
- "ldr d3, [x23, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v7.16b, v16.16b\n"
- "ldr d4, [x23, #0x20]\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "mov v6.16b, v15.16b\n"
- "mov v5.16b, v16.16b\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "add x28, x28, x24\n"
- "add x6, x6, x24\n"
- "add x26, x26, x24\n"
- "add x25, x25, x24\n"
- "add x5, x5, x24\n"
- "add x2, x2, x24\n"
- "add x27, x27, x24\n"
- "add x21, x21, x24\n"
- "add x12, x12, x24\n"
- "add x19, x19, x24\n"
- "tbz x0, #2, 9f\n"
- "ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x6], #0x4\n"
- "ld1 { v29.s }[0], [x26], #0x4\n"
- "ld1 { v28.s }[0], [x25], #0x4\n"
- "ld1 { v27.s }[0], [x5], #0x4\n"
- "ld1 { v23.s }[0], [x2], #0x4\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x12], #0x4\n"
- "ld1 { v22.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 8f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x6], #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x5], #0x2\n"
- "ld1 { v23.h }[2], [x2], #0x2\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v22.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x6]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x5]\n"
- "ld1 { v23.b }[6], [x2]\n"
- "ld1 { v25.b }[6], [x27]\n"
- "ld1 { v24.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x12]\n"
- "ld1 { v22.b }[6], [x19]\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x9, x9, x0\n"
+ "add x28, x28, x0\n"
+ "add x27, x27, x0\n"
+ "add x26, x26, x0\n"
+ "add x25, x25, x0\n"
+ "add x24, x24, x0\n"
+ "add x23, x23, x0\n"
+ "add x22, x22, x0\n"
+ "add x21, x21, x0\n"
+ "add x20, x20, x0\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x27], #0x4\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x6]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x5]\n"
- "ld1 { v23.b }[4], [x2]\n"
- "ld1 { v25.b }[4], [x27]\n"
- "ld1 { v24.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x12]\n"
- "ld1 { v22.b }[4], [x19]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x0, #1, 10f\n"
- "ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x6], #0x2\n"
- "ld1 { v29.h }[0], [x26], #0x2\n"
- "ld1 { v28.h }[0], [x25], #0x2\n"
- "ld1 { v27.h }[0], [x5], #0x2\n"
- "ld1 { v23.h }[0], [x2], #0x2\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
- "ld1 { v26.h }[0], [x12], #0x2\n"
- "ld1 { v22.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x6]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x5]\n"
- "ld1 { v23.b }[2], [x2]\n"
- "ld1 { v25.b }[2], [x27]\n"
- "ld1 { v24.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x12]\n"
- "ld1 { v22.b }[2], [x19]\n"
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x27], #0x2\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x6]\n"
- "ld1 { v29.b }[0], [x26]\n"
- "ld1 { v28.b }[0], [x25]\n"
- "ld1 { v27.b }[0], [x5]\n"
- "ld1 { v23.b }[0], [x2]\n"
- "ld1 { v25.b }[0], [x27]\n"
- "ld1 { v24.b }[0], [x21]\n"
- "ld1 { v26.b }[0], [x12]\n"
- "ld1 { v22.b }[0], [x19]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x27]\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v22.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"ssubl v31.8h, v31.8b, v9.8b\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x19, [x20, #0x50]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x4, #0x50]\n"
"ssubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
- "add x19, x19, x24\n"
+ "add x20, x20, x0\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
"ssubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "tbz x0, #2, 13f\n"
- "ld1 { v31.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 12f\n"
- "ld1 { v31.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[6], [x19]\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[4], [x19]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x0, #1, 14f\n"
- "ld1 { v31.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[2], [x19]\n"
+ "tbz x1, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[0], [x19]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr x15, [x20, #0x58]\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "add x15, x15, x24\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "add x22, x22, x0\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "tbz x0, #2, 17f\n"
- "ld1 { v30.s }[0], [x15], #0x4\n"
- "tbz x0, #1, 16f\n"
- "ld1 { v30.h }[2], [x15], #0x2\n"
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[6], [x15]\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[4], [x15]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x0, #1, 18f\n"
- "ld1 { v30.h }[0], [x15], #0x2\n"
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[2], [x15]\n"
+ "tbz x1, #1, 18f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[0], [x15]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"ssubl v30.8h, v30.8b, v9.8b\n"
- "ldr x19, [x20, #0x60]\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "add x19, x19, x24\n"
- "tbz x0, #2, 21f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 20f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "add x21, x21, x0\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x0, #1, 22f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x1, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d0, [x3, #0x28]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr d0, [x23, #0x28]\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x27, [x20, #0x68]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x27, x27, x24\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
"smlal2 v7.4s, v22.8h, v0.8h\n"
- "tbz x0, #2, 25f\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "tbz x0, #1, 24f\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[6], [x27]\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[4], [x27]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x0, #1, 26f\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[2], [x27]\n"
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[0], [x27]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x23, #0x30]\n"
+ "ldr d1, [x3, #0x30]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x5, [x20, #0x70]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "add x5, x5, x24\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "add x26, x26, x0\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
"smlal2 v7.4s, v25.8h, v1.8h\n"
- "tbz x0, #2, 29f\n"
- "ld1 { v24.s }[0], [x5], #0x4\n"
- "tbz x0, #1, 28f\n"
- "ld1 { v24.h }[2], [x5], #0x2\n"
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[6], [x5]\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v24.s }[0], [x26], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[6], [x26]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[4], [x5]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[4], [x26]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x0, #1, 30f\n"
- "ld1 { v24.h }[0], [x5], #0x2\n"
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[2], [x5]\n"
+ "tbz x1, #1, 30f\n"
+ "ld1 { v24.h }[0], [x26], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[2], [x26]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[0], [x5]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[0], [x26]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x23, #0x38]\n"
+ "ldr d2, [x3, #0x38]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "add x11, x11, x24\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x25, [x4, #0x78]\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "add x25, x25, x0\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
"smlal2 v7.4s, v24.8h, v2.8h\n"
- "tbz x0, #2, 33f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
- "tbz x0, #1, 32f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x0, #1, 34f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "tbz x1, #1, 34f\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[0], [x25]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x23, #0x40]\n"
+ "ldr d3, [x3, #0x40]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x12, [x20, #0x80]\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "add x12, x12, x24\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "add x23, x23, x0\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
"smlal2 v7.4s, v27.8h, v3.8h\n"
- "tbz x0, #2, 37f\n"
- "ld1 { v23.s }[0], [x12], #0x4\n"
- "tbz x0, #1, 36f\n"
- "ld1 { v23.h }[2], [x12], #0x2\n"
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[6], [x12]\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[6], [x23]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[4], [x12]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[4], [x23]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x0, #1, 38f\n"
- "ld1 { v23.h }[0], [x12], #0x2\n"
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[2], [x12]\n"
+ "tbz x1, #1, 38f\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[2], [x23]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[0], [x12]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[0], [x23]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x23, #0x48]\n"
+ "ldr d4, [x3, #0x48]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x26, [x20, #0x88]\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "add x26, x26, x24\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x24, [x4, #0x88]\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "add x24, x24, x0\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "tbz x0, #2, 41f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x0, #1, 40f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x0, #1, 42f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x1, #1, 42f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x23, #0x50]\n"
+ "ldr d0, [x3, #0x50]\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x14, [x20, #0x90]\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "add x14, x14, x24\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "tbz x0, #2, 45f\n"
- "ld1 { v31.s }[0], [x14], #0x4\n"
- "tbz x0, #1, 44f\n"
- "ld1 { v31.h }[2], [x14], #0x2\n"
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[6], [x14]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "add x15, x15, x0\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x15], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x15], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x15]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[4], [x14]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x15]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x0, #1, 46f\n"
- "ld1 { v31.h }[0], [x14], #0x2\n"
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[2], [x14]\n"
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x15], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x15]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[0], [x14]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x15]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
"ssubl v31.8h, v31.8b, v9.8b\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
"smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x15, x15, x24\n"
- "tbz x0, #2, 49f\n"
- "ld1 { v30.s }[0], [x15], #0x4\n"
- "tbz x0, #1, 48f\n"
- "ld1 { v30.h }[2], [x15], #0x2\n"
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[6], [x15]\n"
+ "add x21, x21, x0\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[4], [x15]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x0, #1, 50f\n"
- "ld1 { v30.h }[0], [x15], #0x2\n"
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[2], [x15]\n"
+ "tbz x1, #1, 50f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[0], [x15]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x23, #0x58]\n"
+ "ldr d1, [x3, #0x58]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x21, [x20, #0xa0]\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "add x21, x21, x24\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "add x14, x14, x0\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
"smlal2 v7.4s, v30.8h, v1.8h\n"
- "tbz x0, #2, 53f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x0, #1, 52f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[6], [x14]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[4], [x14]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x0, #1, 54f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x1, #1, 54f\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[2], [x14]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[0], [x14]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x23, #0x60]\n"
+ "ldr d2, [x3, #0x60]\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x2, [x20, #0xa8]\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "add x2, x2, x24\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "add x13, x13, x0\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
"smlal2 v7.4s, v26.8h, v2.8h\n"
- "tbz x0, #2, 57f\n"
- "ld1 { v25.s }[0], [x2], #0x4\n"
- "tbz x0, #1, 56f\n"
- "ld1 { v25.h }[2], [x2], #0x2\n"
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[6], [x2]\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v25.s }[0], [x13], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v25.h }[2], [x13], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[6], [x13]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[4], [x2]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[4], [x13]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x0, #1, 58f\n"
- "ld1 { v25.h }[0], [x2], #0x2\n"
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[2], [x2]\n"
+ "tbz x1, #1, 58f\n"
+ "ld1 { v25.h }[0], [x13], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[2], [x13]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[0], [x2]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[0], [x13]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x23, #0x68]\n"
+ "ldr d3, [x3, #0x68]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x13, [x20, #0xb0]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x13, x13, x24\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "add x12, x12, x0\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x0, #2, 61f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "tbz x0, #1, 60f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[6], [x13]\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v24.s }[0], [x12], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v24.h }[2], [x12], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[6], [x12]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[4], [x13]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[4], [x12]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x0, #1, 62f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[2], [x13]\n"
+ "tbz x1, #1, 62f\n"
+ "ld1 { v24.h }[0], [x12], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[2], [x12]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[0], [x13]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[0], [x12]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x23, #0x70]\n"
+ "ldr d4, [x3, #0x70]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x9, x9, x24\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x0, #2, 65f\n"
- "ld1 { v22.s }[0], [x9], #0x4\n"
- "tbz x0, #1, 64f\n"
- "ld1 { v22.h }[2], [x9], #0x2\n"
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[6], [x9]\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[4], [x9]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x0, #1, 66f\n"
- "ld1 { v22.h }[0], [x9], #0x2\n"
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[2], [x9]\n"
+ "tbz x1, #1, 66f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[0], [x9]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x23, #0x78]\n"
+ "ldr d0, [x3, #0x78]\n"
"ssubl v22.8h, v22.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x19, [x20, #0xc0]\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "add x19, x19, x24\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "tbz x0, #2, 69f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 68f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "add x11, x11, x0\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x0, #1, 70f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x1, #1, 70f\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[0], [x11]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
"ssubl v27.8h, v27.8b, v9.8b\n"
- "ldr x28, [x20, #0xc8]\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
"smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x28, x28, x24\n"
- "tbz x0, #2, 73f\n"
- "ld1 { v23.s }[0], [x28], #0x4\n"
- "tbz x0, #1, 72f\n"
- "ld1 { v23.h }[2], [x28], #0x2\n"
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[6], [x28]\n"
+ "add x22, x22, x0\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v23.s }[0], [x22], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[6], [x22]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[4], [x28]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[4], [x22]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x0, #1, 74f\n"
- "ld1 { v23.h }[0], [x28], #0x2\n"
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[2], [x28]\n"
+ "tbz x1, #1, 74f\n"
+ "ld1 { v23.h }[0], [x22], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[2], [x22]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[0], [x28]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[0], [x22]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x23, #0x80]\n"
+ "ldr d1, [x3, #0x80]\n"
"ssubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x6, [x20, #0xd0]\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "add x6, x6, x24\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "add x9, x9, x0\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
"smlal2 v7.4s, v23.8h, v1.8h\n"
- "tbz x0, #2, 77f\n"
- "ld1 { v31.s }[0], [x6], #0x4\n"
- "tbz x0, #1, 76f\n"
- "ld1 { v31.h }[2], [x6], #0x2\n"
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[6], [x6]\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[6], [x9]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[4], [x6]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[4], [x9]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x0, #1, 78f\n"
- "ld1 { v31.h }[0], [x6], #0x2\n"
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[2], [x6]\n"
+ "tbz x1, #1, 78f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[2], [x9]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[0], [x6]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[0], [x9]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x23, #0x88]\n"
+ "ldr d2, [x3, #0x88]\n"
"ssubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "add x27, x27, x24\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "add x28, x28, x0\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
"smlal2 v7.4s, v31.8h, v2.8h\n"
- "tbz x0, #2, 81f\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "tbz x0, #1, 80f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[6], [x28]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[4], [x28]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x0, #1, 82f\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "tbz x1, #1, 82f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[2], [x28]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[0], [x28]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x23, #0x90]\n"
+ "ldr d3, [x3, #0x90]\n"
"ssubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x11, [x20, #0xe0]\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "add x11, x11, x24\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "add x27, x27, x0\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
"smlal2 v7.4s, v30.8h, v3.8h\n"
- "tbz x0, #2, 85f\n"
- "ld1 { v28.s }[0], [x11], #0x4\n"
- "tbz x0, #1, 84f\n"
- "ld1 { v28.h }[2], [x11], #0x2\n"
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[6], [x11]\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[6], [x27]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[4], [x11]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[4], [x27]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x0, #1, 86f\n"
- "ld1 { v28.h }[0], [x11], #0x2\n"
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[2], [x11]\n"
+ "tbz x1, #1, 86f\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[2], [x27]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[0], [x11]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[0], [x27]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x23, #0x98]\n"
+ "ldr d4, [x3, #0x98]\n"
"ssubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "add x17, x17, x24\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "add x26, x26, x0\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "tbz x0, #2, 89f\n"
- "ld1 { v26.s }[0], [x17], #0x4\n"
- "tbz x0, #1, 88f\n"
- "ld1 { v26.h }[2], [x17], #0x2\n"
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[6], [x17]\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[6], [x26]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[4], [x17]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[4], [x26]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x0, #1, 90f\n"
- "ld1 { v26.h }[0], [x17], #0x2\n"
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[2], [x17]\n"
+ "tbz x1, #1, 90f\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[2], [x26]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[0], [x17]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[0], [x26]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x23, #0xa0]\n"
+ "ldr d0, [x3, #0xa0]\n"
"ssubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x5, [x20, #0xf0]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x5, x5, x24\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "tbz x0, #2, 93f\n"
- "ld1 { v25.s }[0], [x5], #0x4\n"
- "tbz x0, #1, 92f\n"
- "ld1 { v25.h }[2], [x5], #0x2\n"
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[6], [x5]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "add x25, x25, x0\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[6], [x25]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[4], [x5]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[4], [x25]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x0, #1, 94f\n"
- "ld1 { v25.h }[0], [x5], #0x2\n"
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[2], [x5]\n"
+ "tbz x1, #1, 94f\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[2], [x25]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[0], [x5]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[0], [x25]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
"ssubl v25.8h, v25.8b, v9.8b\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x25, x25, x24\n"
- "tbz x0, #2, 97f\n"
- "ld1 { v24.s }[0], [x25], #0x4\n"
- "tbz x0, #1, 96f\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[6], [x25]\n"
+ "add x24, x24, x0\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[6], [x24]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[4], [x25]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[4], [x24]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x0, #1, 98f\n"
- "ld1 { v24.h }[0], [x25], #0x2\n"
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "tbz x1, #1, 98f\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[2], [x24]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[0], [x24]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x23, #0xa8]\n"
+ "ldr d1, [x3, #0xa8]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "add x26, x26, x24\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "add x23, x23, x0\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "tbz x0, #2, 101f\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "tbz x0, #1, 100f\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[6], [x26]\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[4], [x26]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x0, #1, 102f\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[2], [x26]\n"
+ "tbz x1, #1, 102f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[0], [x26]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x23, #0xb0]\n"
+ "ldr d2, [x3, #0xb0]\n"
"ssubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "add x12, x12, x24\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "add x15, x15, x0\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "tbz x0, #2, 105f\n"
- "ld1 { v25.s }[0], [x12], #0x4\n"
- "tbz x0, #1, 104f\n"
- "ld1 { v25.h }[2], [x12], #0x2\n"
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[6], [x12]\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v25.s }[0], [x15], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v25.h }[2], [x15], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[6], [x15]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[4], [x12]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[4], [x15]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x0, #1, 106f\n"
- "ld1 { v25.h }[0], [x12], #0x2\n"
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[2], [x12]\n"
+ "tbz x1, #1, 106f\n"
+ "ld1 { v25.h }[0], [x15], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[2], [x15]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[0], [x12]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[0], [x15]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x23, #0xb8]\n"
+ "ldr d3, [x3, #0xb8]\n"
"ssubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x14, [x20, #0x110]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x14, x14, x24\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "add x21, x21, x0\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x0, #2, 109f\n"
- "ld1 { v24.s }[0], [x14], #0x4\n"
- "tbz x0, #1, 108f\n"
- "ld1 { v24.h }[2], [x14], #0x2\n"
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[6], [x14]\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[4], [x14]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x0, #1, 110f\n"
- "ld1 { v24.h }[0], [x14], #0x2\n"
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[2], [x14]\n"
+ "tbz x1, #1, 110f\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[0], [x14]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[0], [x21]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x23, #0xc0]\n"
+ "ldr d4, [x3, #0xc0]\n"
"ssubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x21, x21, x24\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x0, #2, 113f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x0, #1, 112f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x0, #1, 114f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "tbz x1, #1, 114f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
"ssubl v27.8h, v27.8b, v9.8b\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x0, #2, 117f\n"
- "ld1 { v12.4s }, [x10], #0x10\n"
- "ld1 { v19.4s }, [x1], #0x10\n"
- "tbz x0, #1, 116f\n"
- "ld1 { v20.d }[0], [x10], #0x8\n"
- "ld1 { v29.d }[0], [x1], #0x8\n"
- "tbz x0, #0, 119f\n"
- "ld1 { v20.s }[2], [x10]\n"
- "ld1 { v29.s }[2], [x1]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v18.4s }, [x5], #0x10\n"
+ "ld1 { v6.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v5.d }[0], [x5], #0x8\n"
+ "ld1 { v22.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v5.s }[2], [x5]\n"
+ "ld1 { v22.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x0, #0, 119f\n"
- "ld1 { v20.s }[0], [x10]\n"
- "ld1 { v29.s }[0], [x1]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v5.s }[0], [x5]\n"
+ "ld1 { v22.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x0, #1, 118f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
- "ld1 { v19.d }[0], [x1], #0x8\n"
- "tbz x0, #0, 119f\n"
- "ld1 { v12.s }[2], [x10]\n"
- "ld1 { v19.s }[2], [x1]\n"
+ "tbz x1, #1, 118f\n"
+ "ld1 { v18.d }[0], [x5], #0x8\n"
+ "ld1 { v6.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v18.s }[2], [x5]\n"
+ "ld1 { v6.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 119f\n"
- "ld1 { v12.s }[0], [x10]\n"
- "ld1 { v19.s }[0], [x1]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v18.s }[0], [x5]\n"
+ "ld1 { v6.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "add x16, x16, x22\n"
- "add x8, x8, x22\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "add x4, x4, x22\n"
- "add x7, x7, x22\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "add x17, x17, x10\n"
+ "add x6, x6, x10\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x7, x7, x10\n"
+ "add x16, x16, x10\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "tbz x0, #2, 121f\n"
- "st1 { v15.s }[0], [x16], #0x4\n"
- "st1 { v17.s }[0], [x8], #0x4\n"
- "st1 { v10.s }[0], [x4], #0x4\n"
- "st1 { v6.s }[0], [x7], #0x4\n"
- "tbz x0, #1, 120f\n"
- "st1 { v15.h }[2], [x16], #0x2\n"
- "st1 { v17.h }[2], [x8], #0x2\n"
- "st1 { v10.h }[2], [x4], #0x2\n"
- "st1 { v6.h }[2], [x7], #0x2\n"
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[6], [x16], #0x1\n"
- "st1 { v17.b }[6], [x8], #0x1\n"
- "st1 { v10.b }[6], [x4], #0x1\n"
- "st1 { v6.b }[6], [x7], #0x1\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v13.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x6], #0x4\n"
+ "st1 { v8.s }[0], [x7], #0x4\n"
+ "st1 { v17.s }[0], [x16], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v13.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x6], #0x2\n"
+ "st1 { v8.h }[2], [x7], #0x2\n"
+ "st1 { v17.h }[2], [x16], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x6], #0x1\n"
+ "st1 { v8.b }[6], [x7], #0x1\n"
+ "st1 { v17.b }[6], [x16], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[4], [x16], #0x1\n"
- "st1 { v17.b }[4], [x8], #0x1\n"
- "st1 { v10.b }[4], [x4], #0x1\n"
- "st1 { v6.b }[4], [x7], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x6], #0x1\n"
+ "st1 { v8.b }[4], [x7], #0x1\n"
+ "st1 { v17.b }[4], [x16], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x0, #1, 122f\n"
- "st1 { v15.h }[0], [x16], #0x2\n"
- "st1 { v17.h }[0], [x8], #0x2\n"
- "st1 { v10.h }[0], [x4], #0x2\n"
- "st1 { v6.h }[0], [x7], #0x2\n"
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[2], [x16], #0x1\n"
- "st1 { v17.b }[2], [x8], #0x1\n"
- "st1 { v10.b }[2], [x4], #0x1\n"
- "st1 { v6.b }[2], [x7], #0x1\n"
+ "tbz x1, #1, 122f\n"
+ "st1 { v13.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x6], #0x2\n"
+ "st1 { v8.h }[0], [x7], #0x2\n"
+ "st1 { v17.h }[0], [x16], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x6], #0x1\n"
+ "st1 { v8.b }[2], [x7], #0x1\n"
+ "st1 { v17.b }[2], [x16], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[0], [x16], #0x1\n"
- "st1 { v17.b }[0], [x8], #0x1\n"
- "st1 { v10.b }[0], [x4], #0x1\n"
- "st1 { v6.b }[0], [x7], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x6], #0x1\n"
+ "st1 { v8.b }[0], [x7], #0x1\n"
+ "st1 { v17.b }[0], [x16], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 78f748ad58..3f345cf95a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,583 +41,577 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v12.4s }, [x19]\n"
+ "lsr x12, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ld1r { v10.16b }, [x19]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v9.16b }, [x20]\n"
- "ld1r { v8.4s }, [x19]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
- "ld1r { v7.4s }, [x20]\n"
- "ld1r { v6.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "ld1r { v5.4s }, [x19]\n"
- "lsr x10, %x[n_channels], #0x2\n"
- "cbz x10, 6f\n"
+ "cbz x12, 6f\n"
"1:" // Channel loop
- "movi v27.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x11, #0x2\n"
- "ldr q27, [%x[bias], x19]\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
- "mov v26.16b, v27.16b\n"
- "ldr s16, [%x[params]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "mov v25.16b, v27.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, %x[n_points], #0x1\n"
- "mov v24.16b, v27.16b\n"
- "ldr s4, [x9, x11]\n"
- "mov v23.16b, v27.16b\n"
- "mov v22.16b, v27.16b\n"
- "ldr s3, [x28, x11]\n"
- "mov v21.16b, v27.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v20.16b, v27.16b\n"
- "ldr s2, [x27, x11]\n"
- "mov v19.16b, v27.16b\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
- "ldr s1, [x26, x11]\n"
- "ssubl v4.8h, v4.8b, v10.8b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "ssubl v3.8h, v3.8b, v10.8b\n"
- "ldr s0, [x25, x11]\n"
- "ssubl v2.8h, v2.8b, v10.8b\n"
- "ssubl v1.8h, v1.8b, v10.8b\n"
- "ldr s31, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "ssubl v0.8h, v0.8b, v10.8b\n"
- "ldr s30, [x23, x11]\n"
- "ldr s29, [x22, x11]\n"
- "ssubl v31.8h, v31.8b, v10.8b\n"
- "ldr x21, [x20], #0x8\n"
- "ssubl v30.8h, v30.8b, v10.8b\n"
- "ldr s28, [x21, x11]\n"
- "ssubl v29.8h, v29.8b, v10.8b\n"
- "ssubl v28.8h, v28.8b, v10.8b\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "ldr s14, [x10, x11]\n"
+ "ldr s15, [x9, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldr s16, [x28, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x27, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x26, x11]\n"
+ "ldr s19, [x25, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr s20, [x24, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x22, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "smlal v27.4s, v4.4h, v16.4h\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, x19, #0x1\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "ldr s4, [x9, x11]\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "ldr s3, [x28, x11]\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "ldr s2, [x27, x11]\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
- "ldr s16, [%x[params]], #0x4\n"
- "ssubl v4.8h, v4.8b, v10.8b\n"
- "ldr s1, [x26, x11]\n"
- "ssubl v3.8h, v3.8b, v10.8b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "ssubl v2.8h, v2.8b, v10.8b\n"
- "ldr s0, [x25, x11]\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v10.8b\n"
- "ldr s31, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "ssubl v0.8h, v0.8b, v10.8b\n"
- "ldr s30, [x23, x11]\n"
- "ldr s29, [x22, x11]\n"
- "ssubl v31.8h, v31.8b, v10.8b\n"
- "ldr x21, [x20], #0x8\n"
- "ssubl v30.8h, v30.8b, v10.8b\n"
- "ldr s28, [x21, x11]\n"
- "ssubl v29.8h, v29.8b, v10.8b\n"
- "ssubl v28.8h, v28.8b, v10.8b\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x10, x11]\n"
+ "ldr s15, [x9, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x28, x11]\n"
+ "ldr s17, [x27, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldr s18, [x26, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x25, x11]\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x20, x20, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x24, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x22, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
- "smlal v27.4s, v4.4h, v16.4h\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
"cbz %x[rq_mul_ptr], 5f\n"
- "lsl x19, x11, #0x2\n"
- "ldr q6, [%x[rq_mul_ptr], x19]\n"
- "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 5f\n"
- "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
"5:" // Channel loop: Load quantisation parameters: Done
- "sshl v27.4s, v27.4s, v7.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
- "sshl v26.4s, v26.4s, v7.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "sshl v25.4s, v25.4s, v7.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
- "sqrdmulh v27.4s, v27.4s, v6.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "sqrdmulh v25.4s, v25.4s, v6.4s\n"
- "sshl v24.4s, v24.4s, v7.4s\n"
- "and v16.16b, v27.16b, v5.16b\n"
- "and v18.16b, v26.16b, v5.16b\n"
- "and v17.16b, v25.16b, v5.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v21.16b, v23.16b, v1.16b\n"
+ "and v20.16b, v24.16b, v1.16b\n"
+ "and v19.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "and v18.16b, v26.16b, v1.16b\n"
+ "and v17.16b, v27.16b, v1.16b\n"
+ "and v16.16b, v28.16b, v1.16b\n"
+ "and v21.16b, v29.16b, v1.16b\n"
+ "and v20.16b, v30.16b, v1.16b\n"
+ "and v19.16b, v31.16b, v1.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
- "sqrdmulh v24.4s, v24.4s, v6.4s\n"
- "srshl v27.4s, v27.4s, v5.4s\n"
- "srshl v26.4s, v26.4s, v5.4s\n"
- "srshl v25.4s, v25.4s, v5.4s\n"
- "and v16.16b, v24.16b, v5.16b\n"
- "add v27.4s, v27.4s, v8.4s\n"
- "add v26.4s, v26.4s, v8.4s\n"
- "add v25.4s, v25.4s, v8.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v27.4s, v27.4s, v12.4s\n"
- "smax v26.4s, v26.4s, v12.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v12.4s\n"
- "srshl v24.4s, v24.4s, v5.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x27, x11]\n"
- "add v24.4s, v24.4s, v8.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x26, x11]\n"
- "smax v24.4s, v24.4s, v12.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x25, x11]\n"
- "sshl v23.4s, v23.4s, v7.4s\n"
- "sshl v22.4s, v22.4s, v7.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "sqrdmulh v23.4s, v23.4s, v6.4s\n"
- "sqrdmulh v22.4s, v22.4s, v6.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sshl v21.4s, v21.4s, v7.4s\n"
- "and v17.16b, v23.16b, v5.16b\n"
- "and v16.16b, v22.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x24, x11]\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v21.16b, v5.16b\n"
- "sshl v20.4s, v20.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v7.4s\n"
- "srshl v23.4s, v23.4s, v5.4s\n"
- "srshl v22.4s, v22.4s, v5.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v6.4s\n"
- "add v23.4s, v23.4s, v8.4s\n"
- "add v22.4s, v22.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "and v17.16b, v20.16b, v5.16b\n"
- "sqrdmulh v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v5.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v19.16b, v5.16b\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v21.4s, v21.4s, v8.4s\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "smax v21.4s, v21.4s, v12.4s\n"
- "srshl v20.4s, v20.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "add v20.4s, v20.4s, v8.4s\n"
- "srshl v19.4s, v19.4s, v5.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x23, x11]\n"
- "add v19.4s, v19.4s, v8.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x22, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x21, x11]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x20, x11]\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x19, x11]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x10, LSL #2\n"
+ "cmp x11, x12, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 24f\n"
- "movi v27.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
"cbz %x[bias], 9f\n"
- "add x19, %x[bias], x11, LSL #2\n"
+ "add x20, %x[bias], x11, LSL #2\n"
"tbz %x[n_channels], #1, 7f\n"
- "ld1 { v27.d }[0], [x19], #0x8\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v27.s }[2], [x19], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"b 8f\n"
"7:" // Oddments: Load bias: Bit 1: Unset
- "tbz %x[n_channels], #0, 8f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"8:" // Oddments: Load bias: Bit 1: End
-
"9:" // Oddments: Load bias: Done
- "mov v26.16b, v27.16b\n"
- "ldr s16, [%x[params]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "mov v25.16b, v27.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr x22, [x21], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v24.16b, v27.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v23.16b, v27.16b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "mov v22.16b, v27.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"add x28, x28, x11\n"
- "mov v21.16b, v27.16b\n"
- "ldp x23, x22, [x20], #0x10\n"
- "mov v20.16b, v27.16b\n"
"add x27, x27, x11\n"
- "mov v19.16b, v27.16b\n"
- "ldr x21, [x20], #0x8\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h4, [x9], #0x2\n"
- "ldr h3, [x28], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h31, [x24], #0x2\n"
- "ldr h30, [x23], #0x2\n"
- "ldr h29, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v4.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x28], #0x1\n"
- "ld1 { v2.b }[2], [x27], #0x1\n"
- "ld1 { v1.b }[2], [x26], #0x1\n"
- "ld1 { v0.b }[2], [x25], #0x1\n"
- "ld1 { v31.b }[2], [x24], #0x1\n"
- "ld1 { v30.b }[2], [x23], #0x1\n"
- "ld1 { v29.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v14.b }[2], [x10], #0x1\n"
+ "ld1 { v15.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v17.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v19.b }[2], [x25], #0x1\n"
+ "ld1 { v20.b }[2], [x24], #0x1\n"
+ "ld1 { v21.b }[2], [x23], #0x1\n"
+ "ld1 { v22.b }[2], [x22], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 11f\n"
- "ldr b4, [x9], #0x1\n"
- "ldr b3, [x28], #0x1\n"
- "ldr b2, [x27], #0x1\n"
- "ldr b1, [x26], #0x1\n"
- "ldr b0, [x25], #0x1\n"
- "ldr b31, [x24], #0x1\n"
- "ldr b30, [x23], #0x1\n"
- "ldr b29, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
+ "ldr b14, [x10], #0x1\n"
+ "ldr b15, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b19, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b21, [x23], #0x1\n"
+ "ldr b22, [x22], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
- "ssubl v4.8h, v4.8b, v10.8b\n"
- "subs x19, %x[n_points], #0x1\n"
- "ssubl v3.8h, v3.8b, v10.8b\n"
- "ssubl v2.8h, v2.8b, v10.8b\n"
- "ssubl v1.8h, v1.8b, v10.8b\n"
- "ssubl v0.8h, v0.8b, v10.8b\n"
- "ssubl v31.8h, v31.8b, v10.8b\n"
- "ssubl v30.8h, v30.8b, v10.8b\n"
- "ssubl v29.8h, v29.8b, v10.8b\n"
- "ssubl v28.8h, v28.8b, v10.8b\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "smlal v27.4s, v4.4h, v16.4h\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x22, [x21], #0x8\n"
+ "add x10, x10, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
"add x9, x9, x11\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "ldp x25, x24, [x20], #0x10\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
"add x28, x28, x11\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "ldp x23, x22, [x20], #0x10\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"add x27, x27, x11\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "ldr x21, [x20], #0x8\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
"add x26, x26, x11\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
- "ldr s16, [%x[params]], #0x4\n"
"add x25, x25, x11\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h4, [x9], #0x2\n"
- "ldr h3, [x28], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h31, [x24], #0x2\n"
- "ldr h30, [x23], #0x2\n"
- "ldr h29, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v4.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x28], #0x1\n"
- "ld1 { v2.b }[2], [x27], #0x1\n"
- "ld1 { v1.b }[2], [x26], #0x1\n"
- "ld1 { v0.b }[2], [x25], #0x1\n"
- "ld1 { v31.b }[2], [x24], #0x1\n"
- "ld1 { v30.b }[2], [x23], #0x1\n"
- "ld1 { v29.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v14.b }[2], [x10], #0x1\n"
+ "ld1 { v15.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v17.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v19.b }[2], [x25], #0x1\n"
+ "ld1 { v20.b }[2], [x24], #0x1\n"
+ "ld1 { v21.b }[2], [x23], #0x1\n"
+ "ld1 { v22.b }[2], [x22], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 14f\n"
- "ldr b4, [x9], #0x1\n"
- "ldr b3, [x28], #0x1\n"
- "ldr b2, [x27], #0x1\n"
- "ldr b1, [x26], #0x1\n"
- "ldr b0, [x25], #0x1\n"
- "ldr b31, [x24], #0x1\n"
- "ldr b30, [x23], #0x1\n"
- "ldr b29, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
+ "ldr b14, [x10], #0x1\n"
+ "ldr b15, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b19, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b21, [x23], #0x1\n"
+ "ldr b22, [x22], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
- "ssubl v4.8h, v4.8b, v10.8b\n"
- "subs x19, x19, #0x1\n"
- "ssubl v3.8h, v3.8b, v10.8b\n"
- "ssubl v2.8h, v2.8b, v10.8b\n"
- "ssubl v1.8h, v1.8b, v10.8b\n"
- "ssubl v0.8h, v0.8b, v10.8b\n"
- "ssubl v31.8h, v31.8b, v10.8b\n"
- "ssubl v30.8h, v30.8b, v10.8b\n"
- "ssubl v29.8h, v29.8b, v10.8b\n"
- "ssubl v28.8h, v28.8b, v10.8b\n"
+ "subs x20, x20, #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
"bgt 12b\n"
"15:" // Oddments: Planar tail
- "smlal v27.4s, v4.4h, v16.4h\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
"cbz %x[rq_mul_ptr], 21f\n"
- "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
- "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
- "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v6.d }[0], [x21], #0x8\n"
- "ld1 { v5.d }[0], [x20], #0x8\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
"cbz %x[rq_left_shift_ptr], 16f\n"
- "ld1 { v7.d }[0], [x19], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v6.s }[2], [x21], #0x4\n"
- "ld1 { v5.s }[2], [x20], #0x4\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
"cbz %x[rq_left_shift_ptr], 17f\n"
- "ld1 { v7.s }[2], [x19], #0x4\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
"17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
"b 20f\n"
"18:" // Oddments: Load quantisation parameters: Bit 1: Unset
- "tbz %x[n_channels], #0, 20f\n"
- "ld1 { v6.s }[0], [x21], #0x4\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
"cbz %x[rq_left_shift_ptr], 19f\n"
- "ld1 { v7.s }[0], [x19], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
"20:" // Oddments: Load quantisation parameters: Bit 1: End
"21:" // Oddments: Load quantisation parameters: Done
- "sshl v27.4s, v27.4s, v7.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v21.16b, v23.16b, v1.16b\n"
+ "and v20.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
- "sqrdmulh v27.4s, v27.4s, v6.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "sshl v26.4s, v26.4s, v7.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
"add x26, x26, x11\n"
- "sshl v25.4s, v25.4s, v7.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "sshl v24.4s, v24.4s, v7.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
+ "and v19.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
- "and v16.16b, v27.16b, v5.16b\n"
"add x24, x24, x11\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
"add x23, x23, x11\n"
- "sqrdmulh v25.4s, v25.4s, v6.4s\n"
"add x22, x22, x11\n"
- "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
"add x21, x21, x11\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
"add x20, x20, x11\n"
- "and v18.16b, v26.16b, v5.16b\n"
- "add x19, x19, x11\n"
- "and v17.16b, v25.16b, v5.16b\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "and v18.16b, v26.16b, v1.16b\n"
+ "and v17.16b, v27.16b, v1.16b\n"
+ "and v16.16b, v28.16b, v1.16b\n"
+ "and v21.16b, v29.16b, v1.16b\n"
+ "and v20.16b, v30.16b, v1.16b\n"
+ "and v19.16b, v31.16b, v1.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v24.16b, v5.16b\n"
- "srshl v27.4s, v27.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v27.4s, v27.4s, v8.4s\n"
- "srshl v26.4s, v26.4s, v5.4s\n"
- "srshl v25.4s, v25.4s, v5.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "smax v27.4s, v27.4s, v12.4s\n"
- "add v26.4s, v26.4s, v8.4s\n"
- "add v25.4s, v25.4s, v8.4s\n"
- "srshl v24.4s, v24.4s, v5.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v12.4s\n"
- "smax v25.4s, v25.4s, v12.4s\n"
- "add v24.4s, v24.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smax v24.4s, v24.4s, v12.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v24.4s, v24.4s, v11.4s\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sshl v23.4s, v23.4s, v7.4s\n"
- "sshl v22.4s, v22.4s, v7.4s\n"
- "sqrdmulh v23.4s, v23.4s, v6.4s\n"
- "sqrdmulh v22.4s, v22.4s, v6.4s\n"
- "sshl v21.4s, v21.4s, v7.4s\n"
- "sshl v20.4s, v20.4s, v7.4s\n"
- "and v17.16b, v23.16b, v5.16b\n"
- "and v16.16b, v22.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v21.16b, v5.16b\n"
- "and v17.16b, v20.16b, v5.16b\n"
- "srshl v23.4s, v23.4s, v5.4s\n"
- "srshl v22.4s, v22.4s, v5.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "add v23.4s, v23.4s, v8.4s\n"
- "add v22.4s, v22.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "smax v23.4s, v23.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v5.4s\n"
- "srshl v20.4s, v20.4s, v5.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "add v21.4s, v21.4s, v8.4s\n"
- "add v20.4s, v20.4s, v8.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v21.4s, v21.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "sshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "sqrdmulh v19.4s, v19.4s, v6.4s\n"
- "and v16.16b, v19.16b, v5.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v5.4s\n"
- "add v19.4s, v19.4s, v8.4s\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_channels], #1, 22f\n"
- "st1 { v27.h }[0], [x27], #0x2\n"
- "st1 { v26.h }[0], [x26], #0x2\n"
- "st1 { v25.h }[0], [x25], #0x2\n"
- "st1 { v24.h }[0], [x24], #0x2\n"
- "st1 { v23.h }[0], [x23], #0x2\n"
- "st1 { v22.h }[0], [x22], #0x2\n"
- "st1 { v21.h }[0], [x21], #0x2\n"
- "st1 { v20.h }[0], [x20], #0x2\n"
- "st1 { v19.h }[0], [x19], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "st1 { v27.b }[2], [x27], #0x1\n"
- "st1 { v26.b }[2], [x26], #0x1\n"
- "st1 { v25.b }[2], [x25], #0x1\n"
- "st1 { v24.b }[2], [x24], #0x1\n"
- "st1 { v23.b }[2], [x23], #0x1\n"
- "st1 { v22.b }[2], [x22], #0x1\n"
- "st1 { v21.b }[2], [x21], #0x1\n"
- "st1 { v20.b }[2], [x20], #0x1\n"
- "st1 { v19.b }[2], [x19], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: Store: Bit 1: Unset
- "tbz %x[n_channels], #0, 23f\n"
- "st1 { v27.b }[0], [x27], #0x1\n"
- "st1 { v26.b }[0], [x26], #0x1\n"
- "st1 { v25.b }[0], [x25], #0x1\n"
- "st1 { v24.b }[0], [x24], #0x1\n"
- "st1 { v23.b }[0], [x23], #0x1\n"
- "st1 { v22.b }[0], [x22], #0x1\n"
- "st1 { v21.b }[0], [x21], #0x1\n"
- "st1 { v20.b }[0], [x20], #0x1\n"
- "st1 { v19.b }[0], [x19], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
"24:" // End
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index cbe3d2cd1c..342a297dd4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,487 +40,475 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "movi v5.16b, #0x1\n"
- "ldr x22, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "ushr v5.4s, v5.4s, #0x8\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v15.16b, #0x1\n"
+ "ushr v15.4s, v15.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
"movi v26.4s, #0x0\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
- "mov x11, #0x0\n"
- "movi v1.4s, #0x0\n"
- "ld1 { v15.16b }, [x22]\n"
- "mov x10, #0x0\n"
- "movi v22.4s, #0x0\n"
- "ld1 { v29.16b }, [x20]\n"
- "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "movi v25.4s, #0x0\n"
- "ld1 { v0.16b }, [x19]\n"
- "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
- "movi v13.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v29.16b, v1.16b\n"
+ "mov v16.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v22.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v30.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v23.16b, v4.16b\n"
+ "mov v21.16b, v4.16b\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "mov v20.16b, v15.16b\n"
- "ldr x19, [%x[inptrs], #0x20]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v20.16b, v4.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x2\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x2\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x4\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "mov v25.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
"cmp %x[n_channels], #0x4\n"
- "ext v20.16b, v20.16b, v20.16b, #0x2\n"
- "ld1r { v4.4s }, [x21]\n"
- "mov v17.16b, v15.16b\n"
- "ld1 { v2.16b }, [x20]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "ld1 { v7.16b }, [x19]\n"
- "mov v23.16b, v15.16b\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "ext v23.16b, v23.16b, v23.16b, #0x6\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "mov v18.16b, v29.16b\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
- "zip1 v15.4s, v15.4s, v17.4s\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "ext v18.16b, v18.16b, v18.16b, #0x2\n"
- "ld1r { v14.4s }, [x9]\n"
- "zip1 v20.4s, v20.4s, v23.4s\n"
- "ld1r { v27.4s }, [x28]\n"
- "zip1 v15.4s, v15.4s, v20.4s\n"
- "ld1r { v23.4s }, [x27]\n"
- "mov v17.16b, v29.16b\n"
- "ldr q6, [%x[params], #0x0]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "ldr q8, [%x[params], #0x10]\n"
- "mov v11.16b, v29.16b\n"
- "ldr q9, [%x[params], #0x20]\n"
- "ext v11.16b, v11.16b, v11.16b, #0x6\n"
- "ldr q10, [%x[params], #0x30]\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v24.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x2\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "zip1 v29.4s, v29.4s, v17.4s\n"
- "mov v12.16b, v0.16b\n"
- "ext v12.16b, v12.16b, v12.16b, #0x2\n"
- "zip1 v18.4s, v18.4s, v11.4s\n"
- "zip1 v29.4s, v29.4s, v18.4s\n"
- "mov v17.16b, v0.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "mov v11.16b, v0.16b\n"
- "ext v11.16b, v11.16b, v11.16b, #0x6\n"
- "mov v18.16b, v2.16b\n"
- "zip1 v0.4s, v0.4s, v17.4s\n"
- "ext v18.16b, v18.16b, v18.16b, #0x2\n"
- "zip1 v12.4s, v12.4s, v11.4s\n"
- "zip1 v0.4s, v0.4s, v12.4s\n"
- "mov v17.16b, v2.16b\n"
+ "zip1 v1.4s, v1.4s, v16.4s\n"
+ "mov v16.16b, v3.16b\n"
+ "zip1 v29.4s, v29.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v31.4s\n"
+ "zip1 v22.4s, v22.4s, v30.4s\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x2\n"
"ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "mov v19.16b, v2.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x6\n"
- "mov v28.16b, v7.16b\n"
- "zip1 v2.4s, v2.4s, v17.4s\n"
- "ext v28.16b, v28.16b, v28.16b, #0x2\n"
- "zip1 v18.4s, v18.4s, v19.4s\n"
- "zip1 v2.4s, v2.4s, v18.4s\n"
- "mov v18.16b, v7.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x4\n"
- "mov v21.16b, v7.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x6\n"
- "movi v30.4s, #0x0\n"
- "zip1 v7.4s, v7.4s, v18.4s\n"
- "movi v3.4s, #0x0\n"
- "zip1 v28.4s, v28.4s, v21.4s\n"
- "zip1 v7.4s, v7.4s, v28.4s\n"
- "movi v12.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v21.4s\n"
+ "zip1 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v25.4s, v25.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v29.4s\n"
+ "zip1 v2.4s, v2.4s, v22.4s\n"
+ ".inst 0x4f81e1fa // sdot v26.4s, v15.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v24.4s, v24.4s, v16.4s\n"
+ ".inst 0x4fa1e1fb // sdot v27.4s, v15.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v23.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4f81e9f7 // sdot v23.4s, v15.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
"movi v21.4s, #0x0\n"
+ ".inst 0x4fa1e9f6 // sdot v22.4s, v15.16b, v1.4b[3]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4f82e1f5 // sdot v21.4s, v15.16b, v2.4b[0]\n"
+ "movi v8.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4fa2e1f4 // sdot v20.4s, v15.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
"movi v17.4s, #0x0\n"
+ ".inst 0x4f82e9e9 // sdot v9.4s, v15.16b, v2.4b[2]\n"
"movi v16.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v25.4s\n"
+ ".inst 0x4fa2e9e8 // sdot v8.4s, v15.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v24.4s\n"
+ ".inst 0x4f84e1f3 // sdot v19.4s, v15.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e1f2 // sdot v18.4s, v15.16b, v4.4b[1]\n"
+ ".inst 0x4f84e9f1 // sdot v17.4s, v15.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e9f0 // sdot v16.4s, v15.16b, v4.4b[3]\n"
"movi v31.4s, #0x0\n"
- ".inst 0x4f8fe0ba // sdot v26.4s, v5.16b, v15.4b[0]\n"
- ".inst 0x4fafe0a1 // sdot v1.4s, v5.16b, v15.4b[1]\n"
- ".inst 0x4f8fe8b6 // sdot v22.4s, v5.16b, v15.4b[2]\n"
- ".inst 0x4fafe8b9 // sdot v25.4s, v5.16b, v15.4b[3]\n"
- ".inst 0x4f9de0ad // sdot v13.4s, v5.16b, v29.4b[0]\n"
- ".inst 0x4fbde0be // sdot v30.4s, v5.16b, v29.4b[1]\n"
- ".inst 0x4f9de8a3 // sdot v3.4s, v5.16b, v29.4b[2]\n"
- ".inst 0x4fbde8ac // sdot v12.4s, v5.16b, v29.4b[3]\n"
- ".inst 0x4f80e0ab // sdot v11.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0b3 // sdot v19.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4f80e8b5 // sdot v21.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f82e0b0 // sdot v16.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4fa2e0bc // sdot v28.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x4f82e8b2 // sdot v18.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8b4 // sdot v20.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4f87e0b8 // sdot v24.4s, v5.16b, v7.4b[0]\n"
- ".inst 0x4fa7e0bf // sdot v31.4s, v5.16b, v7.4b[1]\n"
- "mov v26.16b, v26.16b\n"
- "mov v1.16b, v1.16b\n"
- "mov v22.16b, v22.16b\n"
- "mov v25.16b, v25.16b\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "movi v13.4s, #0x0\n"
- ".inst 0x4f87e8ad // sdot v13.4s, v5.16b, v7.4b[2]\n"
- "add v1.4s, v1.4s, v30.4s\n"
"movi v30.4s, #0x0\n"
- ".inst 0x4fa7e8be // sdot v30.4s, v5.16b, v7.4b[3]\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v11.4s\n"
- "add v1.4s, v1.4s, v19.4s\n"
- "add v22.4s, v22.4s, v21.4s\n"
- "add v25.4s, v25.4s, v17.4s\n"
- "mov v11.16b, v11.16b\n"
- "mov v3.16b, v19.16b\n"
- "mov v19.16b, v21.16b\n"
- "mov v21.16b, v17.16b\n"
- "add v11.4s, v11.4s, v16.4s\n"
- "add v3.4s, v3.4s, v28.4s\n"
- "add v19.4s, v19.4s, v18.4s\n"
- "add v21.4s, v21.4s, v20.4s\n"
- "add v11.4s, v11.4s, v24.4s\n"
- "add v3.4s, v3.4s, v31.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "add v21.4s, v21.4s, v30.4s\n"
- "neg v4.4s, v4.4s\n"
- "mul v26.4s, v26.4s, v4.4s\n"
- "str q26, [SP, #0x0]\n"
- "mul v1.4s, v1.4s, v4.4s\n"
- "mul v22.4s, v22.4s, v4.4s\n"
- "str q1, [SP, #0x10]\n"
- "mul v25.4s, v25.4s, v4.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "str q22, [SP, #0x20]\n"
- "mul v3.4s, v3.4s, v4.4s\n"
- "str q25, [SP, #0x30]\n"
- "mul v19.4s, v19.4s, v4.4s\n"
- "mul v21.4s, v21.4s, v4.4s\n"
- "str q11, [SP, #0x40]\n"
- "add v26.4s, v26.4s, v6.4s\n"
- "str q3, [SP, #0x50]\n"
- "add v1.4s, v1.4s, v6.4s\n"
- "str q19, [SP, #0x60]\n"
- "add v22.4s, v22.4s, v6.4s\n"
- "add v25.4s, v25.4s, v6.4s\n"
- "str q21, [SP, #0x70]\n"
- "add v11.4s, v11.4s, v6.4s\n"
- "add v3.4s, v3.4s, v6.4s\n"
- "add v19.4s, v19.4s, v6.4s\n"
- "add v21.4s, v21.4s, v6.4s\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f80e1ff // sdot v31.4s, v15.16b, v0.4b[0]\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4fa0e1fe // sdot v30.4s, v15.16b, v0.4b[1]\n"
+ ".inst 0x4f80e9fd // sdot v29.4s, v15.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e9fc // sdot v28.4s, v15.16b, v0.4b[3]\n"
+ "add v24.4s, v26.4s, v21.4s\n"
+ "add v25.4s, v27.4s, v20.4s\n"
+ "add v26.4s, v23.4s, v9.4s\n"
+ "add v27.4s, v22.4s, v8.4s\n"
+ "add v23.4s, v19.4s, v21.4s\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4f83e1f6 // sdot v22.4s, v15.16b, v3.4b[0]\n"
+ "add v21.4s, v18.4s, v20.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4fa3e1f4 // sdot v20.4s, v15.16b, v3.4b[1]\n"
+ "add v19.4s, v17.4s, v9.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4f83e9f2 // sdot v18.4s, v15.16b, v3.4b[2]\n"
+ "add v17.4s, v16.4s, v8.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x4fa3e9f0 // sdot v16.4s, v15.16b, v3.4b[3]\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add v28.4s, v23.4s, v22.4s\n"
+ "add v29.4s, v21.4s, v20.4s\n"
+ "add v30.4s, v19.4s, v18.4s\n"
+ "add v31.4s, v17.4s, v16.4s\n"
+ "neg v13.4s, v13.4s\n"
+ "mul v24.4s, v24.4s, v13.4s\n"
+ "mul v25.4s, v25.4s, v13.4s\n"
+ "mul v26.4s, v26.4s, v13.4s\n"
+ "mul v27.4s, v27.4s, v13.4s\n"
+ "mul v28.4s, v28.4s, v13.4s\n"
+ "mul v29.4s, v29.4s, v13.4s\n"
+ "mul v30.4s, v30.4s, v13.4s\n"
+ "mul v31.4s, v31.4s, v13.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"ble 2f\n"
"1:" // Loop
- ".inst 0x4f8fe11a // sdot v26.4s, v8.16b, v15.4b[0]\n"
- "ldr q20, [%x[params], #0x0]\n"
- "add x11, x11, #0x10\n"
- ".inst 0x4fafe101 // sdot v1.4s, v8.16b, v15.4b[1]\n"
- "ldr q4, [%x[params], #0x10]\n"
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q14, [%x[params], #0x20]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x4f8fe916 // sdot v22.4s, v8.16b, v15.4b[2]\n"
- "ldr q6, [%x[params], #0x20]\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x4fafe919 // sdot v25.4s, v8.16b, v15.4b[3]\n"
- ".inst 0x4f80e10b // sdot v11.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4fa0e103 // sdot v3.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4fa0e915 // sdot v21.4s, v8.16b, v0.4b[3]\n"
- "ldr q8, [%x[params], #0x30]\n"
- ".inst 0x4f9de13a // sdot v26.4s, v9.16b, v29.4b[0]\n"
- ".inst 0x4fbde121 // sdot v1.4s, v9.16b, v29.4b[1]\n"
- ".inst 0x4f9de936 // sdot v22.4s, v9.16b, v29.4b[2]\n"
- ".inst 0x4fbde939 // sdot v25.4s, v9.16b, v29.4b[3]\n"
- ".inst 0x4f82e12b // sdot v11.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4fa2e123 // sdot v3.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4f82e933 // sdot v19.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4fa2e935 // sdot v21.4s, v9.16b, v2.4b[3]\n"
- "ldr q9, [%x[params], #0x40]\n"
- ".inst 0x4f80e15a // sdot v26.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4fa0e141 // sdot v1.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f80e956 // sdot v22.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4fa0e959 // sdot v25.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4f87e14b // sdot v11.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x4fa7e143 // sdot v3.4s, v10.16b, v7.4b[1]\n"
- ".inst 0x4f87e953 // sdot v19.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4fa7e955 // sdot v21.4s, v10.16b, v7.4b[3]\n"
- "ldr q10, [%x[params], #0x50]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
"add %x[params], %x[params], #0x60\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "and v30.16b, v26.16b, v4.16b\n"
- "and v17.16b, v1.16b, v4.16b\n"
- "and v16.16b, v22.16b, v4.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v30.4s\n"
- "sqadd v1.4s, v1.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v25.16b, v4.16b\n"
- "srshl v26.4s, v26.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v4.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v1.4s, v1.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v23.4s\n"
- "smin v1.4s, v1.4s, v23.4s\n"
- "smin v22.4s, v22.4s, v23.4s\n"
- "smax v26.4s, v26.4s, v27.4s\n"
- "smax v1.4s, v1.4s, v27.4s\n"
- "smax v22.4s, v22.4s, v27.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x26, x10]\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q26, [SP, #0x0]\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "str s1, [x25, x10]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q1, [SP, #0x10]\n"
- "and v16.16b, v11.16b, v4.16b\n"
- "str s22, [x24, x10]\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "ldr q22, [SP, #0x20]\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v20.4s\n"
- "and v17.16b, v3.16b, v4.16b\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v23.4s\n"
- "and v16.16b, v19.16b, v4.16b\n"
- "srshl v11.4s, v11.4s, v4.4s\n"
- "smax v25.4s, v25.4s, v27.4s\n"
- "sqadd v3.4s, v3.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "smin v24.4s, v24.4s, v10.4s\n"
+ "smin v25.4s, v25.4s, v10.4s\n"
+ "smin v26.4s, v26.4s, v10.4s\n"
+ "smin v27.4s, v27.4s, v10.4s\n"
+ "smin v28.4s, v28.4s, v10.4s\n"
+ "smin v29.4s, v29.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v10.4s\n"
+ "smin v31.4s, v31.4s, v10.4s\n"
+ "smax v24.4s, v24.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v11.4s\n"
+ "smax v27.4s, v27.4s, v11.4s\n"
+ "smax v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v11.4s\n"
+ "smax v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v11.4s, v11.4s, v14.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x23, x10]\n"
- "smin v11.4s, v11.4s, v23.4s\n"
- "srshl v3.4s, v3.4s, v4.4s\n"
- "ldr q25, [SP, #0x30]\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v11.4s, v11.4s, v27.4s\n"
- "add v3.4s, v3.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v4.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "smin v3.4s, v3.4s, v23.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str s11, [x22, x10]\n"
- "smax v3.4s, v3.4s, v27.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "ldr q11, [SP, #0x40]\n"
- "and v16.16b, v21.16b, v4.16b\n"
- "add v26.4s, v26.4s, v6.4s\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "smin v19.4s, v19.4s, v23.4s\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "str s3, [x21, x10]\n"
- "smax v19.4s, v19.4s, v27.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr q3, [SP, #0x50]\n"
- "add v1.4s, v1.4s, v6.4s\n"
- "add v22.4s, v22.4s, v6.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x20, x10]\n"
- "add v25.4s, v25.4s, v6.4s\n"
- "add v11.4s, v11.4s, v6.4s\n"
- "ldr q19, [SP, #0x60]\n"
- "srshl v21.4s, v21.4s, v4.4s\n"
- "add v3.4s, v3.4s, v6.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v19.4s, v19.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v23.4s\n"
- "smax v21.4s, v21.4s, v27.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x19, x10]\n"
- "add x10, x10, #0x4\n"
- "ldr q21, [SP, #0x70]\n"
- "add v21.4s, v21.4s, v6.4s\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"bgt 1b\n"
"2:" // Tail
- ".inst 0x4f8fe11a // sdot v26.4s, v8.16b, v15.4b[0]\n"
- "ldr q20, [%x[params], #0x0]\n"
- "add x26, x26, x10\n"
- ".inst 0x4fafe101 // sdot v1.4s, v8.16b, v15.4b[1]\n"
- "ldr q4, [%x[params], #0x10]\n"
- "add x25, x25, x10\n"
- ".inst 0x4f8fe916 // sdot v22.4s, v8.16b, v15.4b[2]\n"
- "add x24, x24, x10\n"
- ".inst 0x4fafe919 // sdot v25.4s, v8.16b, v15.4b[3]\n"
- "add x23, x23, x10\n"
- ".inst 0x4f80e10b // sdot v11.4s, v8.16b, v0.4b[0]\n"
- "add x22, x22, x10\n"
- ".inst 0x4fa0e103 // sdot v3.4s, v8.16b, v0.4b[1]\n"
- "add x21, x21, x10\n"
- ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
- "add x20, x20, x10\n"
- ".inst 0x4fa0e915 // sdot v21.4s, v8.16b, v0.4b[3]\n"
- "add x19, x19, x10\n"
- ".inst 0x4f9de13a // sdot v26.4s, v9.16b, v29.4b[0]\n"
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x4fbde121 // sdot v1.4s, v9.16b, v29.4b[1]\n"
+ "add x27, x27, x28\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
"add %x[params], %x[params], #0x20\n"
- ".inst 0x4f9de936 // sdot v22.4s, v9.16b, v29.4b[2]\n"
- ".inst 0x4fbde939 // sdot v25.4s, v9.16b, v29.4b[3]\n"
- ".inst 0x4f82e12b // sdot v11.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4fa2e123 // sdot v3.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4f82e933 // sdot v19.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4fa2e935 // sdot v21.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4f80e15a // sdot v26.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4fa0e141 // sdot v1.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4f80e956 // sdot v22.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4fa0e959 // sdot v25.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x4f87e14b // sdot v11.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x4fa7e143 // sdot v3.4s, v10.16b, v7.4b[1]\n"
- ".inst 0x4f87e953 // sdot v19.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4fa7e955 // sdot v21.4s, v10.16b, v7.4b[3]\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "and v30.16b, v26.16b, v4.16b\n"
- "and v17.16b, v1.16b, v4.16b\n"
- "and v16.16b, v22.16b, v4.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v30.4s\n"
- "sqadd v1.4s, v1.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v25.16b, v4.16b\n"
- "srshl v26.4s, v26.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v4.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v1.4s, v1.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v23.4s\n"
- "smin v1.4s, v1.4s, v23.4s\n"
- "smin v22.4s, v22.4s, v23.4s\n"
- "smax v26.4s, v26.4s, v27.4s\n"
- "smax v1.4s, v1.4s, v27.4s\n"
- "smax v22.4s, v22.4s, v27.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v20.4s\n"
- "and v16.16b, v11.16b, v4.16b\n"
- "and v17.16b, v3.16b, v4.16b\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v23.4s\n"
- "sqadd v11.4s, v11.4s, v16.4s\n"
- "sqadd v3.4s, v3.4s, v17.4s\n"
- "smax v25.4s, v25.4s, v27.4s\n"
- "and v16.16b, v19.16b, v4.16b\n"
- "srshl v11.4s, v11.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "smin v24.4s, v24.4s, v10.4s\n"
+ "smin v25.4s, v25.4s, v10.4s\n"
+ "smin v26.4s, v26.4s, v10.4s\n"
+ "smin v27.4s, v27.4s, v10.4s\n"
+ "smin v28.4s, v28.4s, v10.4s\n"
+ "smin v29.4s, v29.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v10.4s\n"
+ "smin v31.4s, v31.4s, v10.4s\n"
+ "smax v24.4s, v24.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v11.4s\n"
+ "smax v27.4s, v27.4s, v11.4s\n"
+ "smax v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v11.4s\n"
+ "smax v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "srshl v3.4s, v3.4s, v4.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v11.4s, v11.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v3.4s, v3.4s, v14.4s\n"
- "smin v11.4s, v11.4s, v23.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v3.4s, v3.4s, v23.4s\n"
- "smax v11.4s, v11.4s, v27.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v3.4s, v3.4s, v27.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "srshl v19.4s, v19.4s, v4.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "and v16.16b, v21.16b, v4.16b\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v19.4s, v19.4s, v23.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v19.4s, v19.4s, v27.4s\n"
- "srshl v21.4s, v21.4s, v4.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "smin v21.4s, v21.4s, v23.4s\n"
- "smax v21.4s, v21.4s, v27.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"blt 3f\n"
- "str s26, [x26, #0x0]\n"
- "str s1, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s11, [x22, #0x0]\n"
- "str s3, [x21, #0x0]\n"
- "str s19, [x20, #0x0]\n"
- "str s21, [x19, #0x0]\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 4f\n"
"3:" // Tail: Oddments
- "st1 { v26.b }[0], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v25.b }[0], [x23], #0x1\n"
- "st1 { v11.b }[0], [x22], #0x1\n"
- "st1 { v3.b }[0], [x21], #0x1\n"
- "st1 { v19.b }[0], [x20], #0x1\n"
- "st1 { v21.b }[0], [x19], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v26.b }[1], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[1], [x25], #0x1\n"
- "st1 { v22.b }[1], [x24], #0x1\n"
- "st1 { v25.b }[1], [x23], #0x1\n"
- "st1 { v11.b }[1], [x22], #0x1\n"
- "st1 { v3.b }[1], [x21], #0x1\n"
- "st1 { v19.b }[1], [x20], #0x1\n"
- "st1 { v21.b }[1], [x19], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v26.b }[2], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v25.b }[2], [x23], #0x1\n"
- "st1 { v11.b }[2], [x22], #0x1\n"
- "st1 { v3.b }[2], [x21], #0x1\n"
- "st1 { v19.b }[2], [x20], #0x1\n"
- "st1 { v21.b }[2], [x19], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v26.b }[3], [x26], #0x1\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[3], [x25], #0x1\n"
- "st1 { v22.b }[3], [x24], #0x1\n"
- "st1 { v25.b }[3], [x23], #0x1\n"
- "st1 { v11.b }[3], [x22], #0x1\n"
- "st1 { v3.b }[3], [x21], #0x1\n"
- "st1 { v19.b }[3], [x20], #0x1\n"
- "st1 { v21.b }[3], [x19], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
"4:" // Tail: End
- "add SP, SP, #0x80\n"
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index b198eff6ac..9fa38c6efe 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,622 +40,596 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "movi v15.16b, #0x1\n"
- "ldr x21, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "movi v14.4s, #0x1\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "movi v28.4s, #0x0\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
- "mov x11, #0x0\n"
- "movi v27.4s, #0x0\n"
- "ld1 { v13.16b }, [x21]\n"
- "mov x10, #0x0\n"
- "movi v26.4s, #0x0\n"
- "ld1 { v12.16b }, [x20]\n"
- "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "movi v25.4s, #0x0\n"
- "ld1 { v7.16b }, [x19]\n"
- "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "movi v28.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "movi v31.4s, #0x0\n"
"movi v24.4s, #0x0\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
- "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "mov v18.16b, v13.16b\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "movi v30.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "cmp %x[n_channels], #0x4\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "ldr x19, [%x[inptrs], #0x28]\n"
- "mov v17.16b, v12.16b\n"
- "ld1 { v6.16b }, [x21]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "ld1 { v5.16b }, [x20]\n"
- "mov v16.16b, v7.16b\n"
- "ld1 { v4.16b }, [x19]\n"
+ "mov v16.16b, v3.16b\n"
"ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "ldr x20, [%x[inptrs], #0x30]\n"
- "zip1 v13.2d, v13.2d, v18.2d\n"
- "ldr x19, [%x[inptrs], #0x38]\n"
- "zip1 v12.2d, v12.2d, v17.2d\n"
- "ld1r { v3.4s }, [x22]\n"
- "mov v18.16b, v6.16b\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov v15.16b, v4.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
"ld1 { v2.16b }, [x20]\n"
- "zip1 v7.2d, v7.2d, v16.2d\n"
- "ld1 { v1.16b }, [x19]\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "mov v17.16b, v5.16b\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
- "mov v16.16b, v4.16b\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "zip1 v6.2d, v6.2d, v18.2d\n"
- "ld1r { v0.4s }, [x9]\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "ld1r { v31.4s }, [x28]\n"
- "zip1 v5.2d, v5.2d, v17.2d\n"
- "ld1r { v30.4s }, [x27]\n"
- "mov v17.16b, v2.16b\n"
- "ldr q29, [%x[params], #0x0]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "mov v20.16b, v2.16b\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "zip1 v3.2d, v3.2d, v16.2d\n"
+ "zip1 v4.2d, v4.2d, v15.2d\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "mov v26.16b, v1.16b\n"
+ "mov v13.16b, v5.16b\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x38]\n"
+ "mov v19.16b, v6.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v7.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "mov v17.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v20.2d\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4f83e392 // sdot v18.4s, v28.16b, v3.4b[0]\n"
"ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "ldr q8, [%x[params], #0x10]\n"
- "zip1 v4.2d, v4.2d, v16.2d\n"
- "ldr q9, [%x[params], #0x20]\n"
- "mov v16.16b, v1.16b\n"
- "ldr q10, [%x[params], #0x30]\n"
+ ".inst 0x4f83eb9f // sdot v31.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x4f84e398 // sdot v24.4s, v28.16b, v4.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ ".inst 0x4f84eb9e // sdot v30.4s, v28.16b, v4.4b[2]\n"
+ "mov v16.16b, v0.16b\n"
+ ".inst 0x4f82e395 // sdot v21.4s, v28.16b, v2.4b[0]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v29.4s, #0x1\n"
+ ".inst 0x4f82eb94 // sdot v20.4s, v28.16b, v2.4b[2]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
"ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "ldr q11, [%x[params], #0x40]\n"
- "add %x[params], %x[params], #0x50\n"
- "zip1 v2.2d, v2.2d, v17.2d\n"
- "movi v23.4s, #0x0\n"
+ "zip1 v1.2d, v1.2d, v26.2d\n"
+ ".inst 0x4fa3e3b2 // sdot v18.4s, v29.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v13.2d\n"
+ "zip1 v6.2d, v6.2d, v19.2d\n"
+ ".inst 0x4fa3ebbf // sdot v31.4s, v29.16b, v3.4b[3]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v17.2d\n"
"movi v22.4s, #0x0\n"
- "zip1 v1.2d, v1.2d, v16.2d\n"
- "movi v21.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
+ ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x4fa4ebbe // sdot v30.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x4f81e396 // sdot v22.4s, v28.16b, v1.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "movi v25.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ ".inst 0x4f81eb9a // sdot v26.4s, v28.16b, v1.4b[2]\n"
+ "zip1 v0.2d, v0.2d, v16.2d\n"
"movi v19.4s, #0x0\n"
- ".inst 0x4f8de1fc // sdot v28.4s, v15.16b, v13.4b[0]\n"
- ".inst 0x4f8de9fb // sdot v27.4s, v15.16b, v13.4b[2]\n"
- ".inst 0x4f8ce1fa // sdot v26.4s, v15.16b, v12.4b[0]\n"
- ".inst 0x4f8ce9f9 // sdot v25.4s, v15.16b, v12.4b[2]\n"
- ".inst 0x4fade1dc // sdot v28.4s, v14.16b, v13.4b[1]\n"
- ".inst 0x4fade9db // sdot v27.4s, v14.16b, v13.4b[3]\n"
- ".inst 0x4face1da // sdot v26.4s, v14.16b, v12.4b[1]\n"
- ".inst 0x4face9d9 // sdot v25.4s, v14.16b, v12.4b[3]\n"
- ".inst 0x4f87e1f8 // sdot v24.4s, v15.16b, v7.4b[0]\n"
- ".inst 0x4f87e9f7 // sdot v23.4s, v15.16b, v7.4b[2]\n"
- ".inst 0x4f86e1f6 // sdot v22.4s, v15.16b, v6.4b[0]\n"
- ".inst 0x4f86e9f5 // sdot v21.4s, v15.16b, v6.4b[2]\n"
- ".inst 0x4fa7e1d8 // sdot v24.4s, v14.16b, v7.4b[1]\n"
- ".inst 0x4fa7e9d7 // sdot v23.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x4fa6e1d6 // sdot v22.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x4fa6e9d5 // sdot v21.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4f85e1f2 // sdot v18.4s, v15.16b, v5.4b[0]\n"
- ".inst 0x4f85e9f1 // sdot v17.4s, v15.16b, v5.4b[2]\n"
- ".inst 0x4f84e1f0 // sdot v16.4s, v15.16b, v4.4b[0]\n"
- ".inst 0x4f84e9f4 // sdot v20.4s, v15.16b, v4.4b[2]\n"
- ".inst 0x4fa5e1d2 // sdot v18.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa5e9d1 // sdot v17.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa4e1d0 // sdot v16.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa4e9d4 // sdot v20.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4f82e1f3 // sdot v19.4s, v15.16b, v2.4b[0]\n"
- "mov v28.16b, v28.16b\n"
- "mov v27.16b, v27.16b\n"
- "add v28.4s, v28.4s, v26.4s\n"
- ".inst 0x4fa2e1d3 // sdot v19.4s, v14.16b, v2.4b[1]\n"
- "add v27.4s, v27.4s, v25.4s\n"
- "add v28.4s, v28.4s, v24.4s\n"
- "mov v26.16b, v26.16b\n"
- "add v27.4s, v27.4s, v23.4s\n"
- "add v28.4s, v28.4s, v22.4s\n"
- "mov v25.16b, v25.16b\n"
- "add v27.4s, v27.4s, v21.4s\n"
- "add v28.4s, v28.4s, v18.4s\n"
- "add v26.4s, v26.4s, v24.4s\n"
- "add v27.4s, v27.4s, v17.4s\n"
- "add v25.4s, v25.4s, v23.4s\n"
- "add v26.4s, v26.4s, v22.4s\n"
- "mov v24.16b, v24.16b\n"
- "add v25.4s, v25.4s, v21.4s\n"
- "add v26.4s, v26.4s, v18.4s\n"
- "mov v23.16b, v23.16b\n"
- "add v25.4s, v25.4s, v17.4s\n"
- "add v26.4s, v26.4s, v16.4s\n"
- "add v24.4s, v24.4s, v22.4s\n"
- "add v25.4s, v25.4s, v20.4s\n"
- "add v23.4s, v23.4s, v21.4s\n"
- "add v24.4s, v24.4s, v18.4s\n"
- "mov v22.16b, v22.16b\n"
- "add v23.4s, v23.4s, v17.4s\n"
- "add v24.4s, v24.4s, v16.4s\n"
- "mov v21.16b, v21.16b\n"
- "add v23.4s, v23.4s, v20.4s\n"
- "add v24.4s, v24.4s, v19.4s\n"
- "add v22.4s, v22.4s, v18.4s\n"
+ ".inst 0x4f85e399 // sdot v25.4s, v28.16b, v5.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x4f85eb9b // sdot v27.4s, v28.16b, v5.4b[2]\n"
+ ".inst 0x4f86e393 // sdot v19.4s, v28.16b, v6.4b[0]\n"
+ "add v24.4s, v18.4s, v24.4s\n"
+ "mov x9, #0x0\n"
"movi v18.4s, #0x0\n"
- ".inst 0x4f82e9f2 // sdot v18.4s, v15.16b, v2.4b[2]\n"
- "add v21.4s, v21.4s, v17.4s\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4f81e1f1 // sdot v17.4s, v15.16b, v1.4b[0]\n"
- ".inst 0x4fa2e9d2 // sdot v18.4s, v14.16b, v2.4b[3]\n"
- "add v22.4s, v22.4s, v16.4s\n"
+ ".inst 0x4f86eb92 // sdot v18.4s, v28.16b, v6.4b[2]\n"
+ ".inst 0x4fa2e3b5 // sdot v21.4s, v29.16b, v2.4b[1]\n"
+ "mov x28, #0x0\n"
+ ".inst 0x4fa2ebb4 // sdot v20.4s, v29.16b, v2.4b[3]\n"
+ "add v17.4s, v31.4s, v30.4s\n"
+ ".inst 0x4fa1e3b6 // sdot v22.4s, v29.16b, v1.4b[1]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
"movi v16.4s, #0x0\n"
- ".inst 0x4fa1e1d1 // sdot v17.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4f81e9f0 // sdot v16.4s, v15.16b, v1.4b[2]\n"
- "add v23.4s, v23.4s, v18.4s\n"
- "add v21.4s, v21.4s, v20.4s\n"
- "add v22.4s, v22.4s, v19.4s\n"
- ".inst 0x4fa1e9d0 // sdot v16.4s, v14.16b, v1.4b[3]\n"
- "add v21.4s, v21.4s, v18.4s\n"
- "add v22.4s, v22.4s, v17.4s\n"
- "neg v3.4s, v3.4s\n"
- "add v21.4s, v21.4s, v16.4s\n"
- "mul v28.4s, v28.4s, v3.4s\n"
- "str q28, [SP, #0x0]\n"
- "mul v27.4s, v27.4s, v3.4s\n"
- "mul v26.4s, v26.4s, v3.4s\n"
- "str q27, [SP, #0x10]\n"
- "mul v25.4s, v25.4s, v3.4s\n"
- "mul v24.4s, v24.4s, v3.4s\n"
- "str q26, [SP, #0x20]\n"
- "mul v23.4s, v23.4s, v3.4s\n"
- "str q25, [SP, #0x30]\n"
- "mul v22.4s, v22.4s, v3.4s\n"
- "mul v21.4s, v21.4s, v3.4s\n"
- "str q24, [SP, #0x40]\n"
- "add v28.4s, v28.4s, v29.4s\n"
- "str q23, [SP, #0x50]\n"
- "add v27.4s, v27.4s, v29.4s\n"
- "str q22, [SP, #0x60]\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "str q21, [SP, #0x70]\n"
- "add v24.4s, v24.4s, v29.4s\n"
- "add v23.4s, v23.4s, v29.4s\n"
- "add v22.4s, v22.4s, v29.4s\n"
- "add v21.4s, v21.4s, v29.4s\n"
+ ".inst 0x4f87e390 // sdot v16.4s, v28.16b, v7.4b[0]\n"
+ ".inst 0x4fa1ebba // sdot v26.4s, v29.16b, v1.4b[3]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ ".inst 0x4fa5e3b9 // sdot v25.4s, v29.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ebbb // sdot v27.4s, v29.16b, v5.4b[3]\n"
+ "add v30.4s, v21.4s, v24.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ ".inst 0x4fa6e3b3 // sdot v19.4s, v29.16b, v6.4b[1]\n"
+ ".inst 0x4fa6ebb2 // sdot v18.4s, v29.16b, v6.4b[3]\n"
+ "add v31.4s, v20.4s, v17.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ ".inst 0x4fa7e3b0 // sdot v16.4s, v29.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add %x[params], %x[params], #0x50\n"
+ "add v21.4s, v26.4s, v31.4s\n"
+ "add v20.4s, v25.4s, v19.4s\n"
+ "add v19.4s, v27.4s, v18.4s\n"
+ "add v18.4s, v16.4s, v24.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x4f87eb90 // sdot v16.4s, v28.16b, v7.4b[2]\n"
+ ".inst 0x4fa7ebb0 // sdot v16.4s, v29.16b, v7.4b[3]\n"
+ "add v17.4s, v16.4s, v17.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e3b0 // sdot v16.4s, v29.16b, v0.4b[1]\n"
+ "add v24.4s, v22.4s, v16.4s\n"
+ "add v26.4s, v22.4s, v25.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x4fa0ebb0 // sdot v16.4s, v29.16b, v0.4b[3]\n"
+ "add v25.4s, v21.4s, v16.4s\n"
+ "add v27.4s, v21.4s, v27.4s\n"
+ "add v28.4s, v20.4s, v30.4s\n"
+ "add v29.4s, v19.4s, v31.4s\n"
+ "add v30.4s, v18.4s, v20.4s\n"
+ "add v31.4s, v17.4s, v19.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "mul v24.4s, v24.4s, v23.4s\n"
+ "mul v25.4s, v25.4s, v23.4s\n"
+ "mul v26.4s, v26.4s, v23.4s\n"
+ "mul v27.4s, v27.4s, v23.4s\n"
+ "mul v28.4s, v28.4s, v23.4s\n"
+ "mul v29.4s, v29.4s, v23.4s\n"
+ "mul v30.4s, v30.4s, v23.4s\n"
+ "mul v31.4s, v31.4s, v23.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
"ble 2f\n"
"1:" // Loop
- ".inst 0x4f8de11c // sdot v28.4s, v8.16b, v13.4b[0]\n"
- "ldr q20, [%x[params], #0x60]\n"
- "add x11, x11, #0x10\n"
- ".inst 0x4f8de91b // sdot v27.4s, v8.16b, v13.4b[2]\n"
- "ldr q19, [%x[params], #0x70]\n"
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+ "ldr q12, [%x[params], #0x80]\n"
+ ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x4f8ce11a // sdot v26.4s, v8.16b, v12.4b[0]\n"
- "ldr q29, [%x[params], #0x80]\n"
+ ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x4f8ce919 // sdot v25.4s, v8.16b, v12.4b[2]\n"
- ".inst 0x4f87e118 // sdot v24.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e917 // sdot v23.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f86e116 // sdot v22.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e915 // sdot v21.4s, v8.16b, v6.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
"ldr q8, [%x[params], #0x0]\n"
- ".inst 0x4fade13c // sdot v28.4s, v9.16b, v13.4b[1]\n"
- ".inst 0x4fade93b // sdot v27.4s, v9.16b, v13.4b[3]\n"
- ".inst 0x4face13a // sdot v26.4s, v9.16b, v12.4b[1]\n"
- ".inst 0x4face939 // sdot v25.4s, v9.16b, v12.4b[3]\n"
- ".inst 0x4fa7e138 // sdot v24.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e937 // sdot v23.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x4fa6e136 // sdot v22.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e935 // sdot v21.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q9, [%x[params], #0x10]\n"
- ".inst 0x4f8ce15c // sdot v28.4s, v10.16b, v12.4b[0]\n"
- ".inst 0x4f8ce95b // sdot v27.4s, v10.16b, v12.4b[2]\n"
- ".inst 0x4f87e15a // sdot v26.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x4f87e959 // sdot v25.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4f86e158 // sdot v24.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e957 // sdot v23.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e955 // sdot v21.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q10, [%x[params], #0x20]\n"
- ".inst 0x4face17c // sdot v28.4s, v11.16b, v12.4b[1]\n"
- ".inst 0x4face97b // sdot v27.4s, v11.16b, v12.4b[3]\n"
- ".inst 0x4fa7e17a // sdot v26.4s, v11.16b, v7.4b[1]\n"
- ".inst 0x4fa7e979 // sdot v25.4s, v11.16b, v7.4b[3]\n"
- ".inst 0x4fa6e178 // sdot v24.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e977 // sdot v23.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x4fa5e176 // sdot v22.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e975 // sdot v21.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e11a // sdot v26.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91b // sdot v27.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q11, [%x[params], #0x30]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e91b // sdot v27.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f86e11a // sdot v26.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e919 // sdot v25.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f85e118 // sdot v24.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e917 // sdot v23.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f84e116 // sdot v22.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e915 // sdot v21.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e939 // sdot v25.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e13a // sdot v26.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e93b // sdot v27.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x4f84e11c // sdot v28.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e91d // sdot v29.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4f85e11e // sdot v30.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e91f // sdot v31.4s, v8.16b, v5.4b[2]\n"
"ldr q8, [%x[params], #0x40]\n"
- ".inst 0x4fa7e13c // sdot v28.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e93b // sdot v27.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e939 // sdot v25.4s, v9.16b, v6.4b[3]\n"
- ".inst 0x4fa5e138 // sdot v24.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e937 // sdot v23.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x4fa4e136 // sdot v22.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e935 // sdot v21.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4f83e158 // sdot v24.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f83e959 // sdot v25.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4f84e15a // sdot v26.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e95b // sdot v27.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e13c // sdot v28.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e93d // sdot v29.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e13e // sdot v30.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e93f // sdot v31.4s, v9.16b, v5.4b[3]\n"
"ldr q9, [%x[params], #0x50]\n"
- ".inst 0x4f86e15c // sdot v28.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e95b // sdot v27.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f85e15a // sdot v26.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e959 // sdot v25.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f84e158 // sdot v24.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f84e957 // sdot v23.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f82e156 // sdot v22.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f82e955 // sdot v21.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa3e178 // sdot v24.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e979 // sdot v25.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e17a // sdot v26.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e97b // sdot v27.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x4f85e15c // sdot v28.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x4f85e95d // sdot v29.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x4f86e15e // sdot v30.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x4f86e95f // sdot v31.4s, v10.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x4fa6e17c // sdot v28.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e97b // sdot v27.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x4fa5e17a // sdot v26.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e979 // sdot v25.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x4fa4e178 // sdot v24.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x4fa4e977 // sdot v23.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x4fa2e176 // sdot v22.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x4fa2e975 // sdot v21.4s, v11.16b, v2.4b[3]\n"
- "ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x4f85e11c // sdot v28.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f84e11a // sdot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e118 // sdot v24.4s, v8.16b, v4.4b[0]\n"
".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e917 // sdot v23.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%x[params], #0x90]\n"
- ".inst 0x4fa5e13c // sdot v28.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x4fa4e13a // sdot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4f85e11a // sdot v26.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e17c // sdot v28.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e97d // sdot v29.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e17e // sdot v30.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e97f // sdot v31.4s, v11.16b, v6.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x4fa4e138 // sdot v24.4s, v9.16b, v4.4b[1]\n"
".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4fa2e937 // sdot v23.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa1e136 // sdot v22.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4fa1e935 // sdot v21.4s, v9.16b, v1.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4fa5e13a // sdot v26.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f86e11c // sdot v28.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x4f86e91d // sdot v29.4s, v8.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f87e11e // sdot v30.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x4f87e91f // sdot v31.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4fa6e13c // sdot v28.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e93d // sdot v29.4s, v9.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4fa7e13e // sdot v30.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e93f // sdot v31.4s, v9.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
- "add %x[params], %x[params], #0xd0\n"
- "sqrdmulh v28.4s, v28.4s, v20.4s\n"
- "sqrdmulh v27.4s, v27.4s, v20.4s\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "sqrdmulh v24.4s, v24.4s, v20.4s\n"
- "and v18.16b, v28.16b, v19.16b\n"
- "and v17.16b, v27.16b, v19.16b\n"
- "and v16.16b, v26.16b, v19.16b\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v16.16b, v27.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v26.4s, v26.4s, v16.4s\n"
- "and v16.16b, v25.16b, v19.16b\n"
- "srshl v28.4s, v28.4s, v19.4s\n"
- "srshl v27.4s, v27.4s, v19.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v27.4s, v27.4s, v0.4s\n"
- "add v26.4s, v26.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v30.4s\n"
- "smin v27.4s, v27.4s, v30.4s\n"
- "smin v26.4s, v26.4s, v30.4s\n"
- "smax v28.4s, v28.4s, v31.4s\n"
- "smax v27.4s, v27.4s, v31.4s\n"
- "smax v26.4s, v26.4s, v31.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x26, x10]\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "ldr q28, [SP, #0x0]\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "str s27, [x25, x10]\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "ldr q27, [SP, #0x10]\n"
- "and v16.16b, v24.16b, v19.16b\n"
- "str s26, [x24, x10]\n"
- "sqrdmulh v23.4s, v23.4s, v20.4s\n"
- "ldr q26, [SP, #0x20]\n"
- "srshl v25.4s, v25.4s, v19.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "and v17.16b, v23.16b, v19.16b\n"
- "add v25.4s, v25.4s, v0.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v30.4s\n"
- "and v16.16b, v22.16b, v19.16b\n"
- "srshl v24.4s, v24.4s, v19.4s\n"
- "smax v25.4s, v25.4s, v31.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "and v16.16b, v31.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x23, x10]\n"
- "smin v24.4s, v24.4s, v30.4s\n"
- "srshl v23.4s, v23.4s, v19.4s\n"
- "ldr q25, [SP, #0x30]\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v24.4s, v24.4s, v31.4s\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "smin v23.4s, v23.4s, v30.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x22, x10]\n"
- "smax v23.4s, v23.4s, v31.4s\n"
- "add v22.4s, v22.4s, v0.4s\n"
- "ldr q24, [SP, #0x40]\n"
- "and v16.16b, v21.16b, v19.16b\n"
- "add v28.4s, v28.4s, v29.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v22.4s, v22.4s, v30.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x21, x10]\n"
- "smax v22.4s, v22.4s, v31.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr q23, [SP, #0x50]\n"
- "add v27.4s, v27.4s, v29.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x20, x10]\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "add v24.4s, v24.4s, v29.4s\n"
- "ldr q22, [SP, #0x60]\n"
- "srshl v21.4s, v21.4s, v19.4s\n"
- "add v23.4s, v23.4s, v29.4s\n"
- "add v21.4s, v21.4s, v0.4s\n"
- "add v22.4s, v22.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v31.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x19, x10]\n"
- "add x10, x10, #0x4\n"
- "ldr q21, [SP, #0x70]\n"
- "add v21.4s, v21.4s, v29.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
"bgt 1b\n"
"2:" // Tail
- ".inst 0x4f8de11c // sdot v28.4s, v8.16b, v13.4b[0]\n"
- "ldr q20, [%x[params], #0x60]\n"
- "add x26, x26, x10\n"
- ".inst 0x4f8de91b // sdot v27.4s, v8.16b, v13.4b[2]\n"
- "ldr q19, [%x[params], #0x70]\n"
- "add x25, x25, x10\n"
- ".inst 0x4f8ce11a // sdot v26.4s, v8.16b, v12.4b[0]\n"
- "add x24, x24, x10\n"
- ".inst 0x4f8ce919 // sdot v25.4s, v8.16b, v12.4b[2]\n"
- "add x23, x23, x10\n"
- ".inst 0x4f87e118 // sdot v24.4s, v8.16b, v7.4b[0]\n"
- "add x22, x22, x10\n"
- ".inst 0x4f87e917 // sdot v23.4s, v8.16b, v7.4b[2]\n"
- "add x21, x21, x10\n"
- ".inst 0x4f86e116 // sdot v22.4s, v8.16b, v6.4b[0]\n"
- "add x20, x20, x10\n"
- ".inst 0x4f86e915 // sdot v21.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
- "add x19, x19, x10\n"
- ".inst 0x4fade13c // sdot v28.4s, v9.16b, v13.4b[1]\n"
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x4fade93b // sdot v27.4s, v9.16b, v13.4b[3]\n"
- ".inst 0x4face13a // sdot v26.4s, v9.16b, v12.4b[1]\n"
- ".inst 0x4face939 // sdot v25.4s, v9.16b, v12.4b[3]\n"
- ".inst 0x4fa7e138 // sdot v24.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e937 // sdot v23.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x4fa6e136 // sdot v22.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e935 // sdot v21.4s, v9.16b, v6.4b[3]\n"
+ "add x27, x27, x28\n"
+ ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "add x20, x20, x28\n"
+ ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q9, [%x[params], #0x10]\n"
- ".inst 0x4f8ce15c // sdot v28.4s, v10.16b, v12.4b[0]\n"
- ".inst 0x4f8ce95b // sdot v27.4s, v10.16b, v12.4b[2]\n"
- ".inst 0x4f87e15a // sdot v26.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x4f87e959 // sdot v25.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4f86e158 // sdot v24.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e957 // sdot v23.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e955 // sdot v21.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q10, [%x[params], #0x20]\n"
- ".inst 0x4face17c // sdot v28.4s, v11.16b, v12.4b[1]\n"
- ".inst 0x4face97b // sdot v27.4s, v11.16b, v12.4b[3]\n"
- ".inst 0x4fa7e17a // sdot v26.4s, v11.16b, v7.4b[1]\n"
- ".inst 0x4fa7e979 // sdot v25.4s, v11.16b, v7.4b[3]\n"
- ".inst 0x4fa6e178 // sdot v24.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e977 // sdot v23.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x4fa5e176 // sdot v22.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e975 // sdot v21.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e11a // sdot v26.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91b // sdot v27.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q11, [%x[params], #0x30]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x4f87e91b // sdot v27.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f86e11a // sdot v26.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f86e919 // sdot v25.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f85e118 // sdot v24.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e917 // sdot v23.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f84e116 // sdot v22.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f84e915 // sdot v21.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e939 // sdot v25.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e13a // sdot v26.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e93b // sdot v27.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x4f84e11c // sdot v28.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e91d // sdot v29.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4f85e11e // sdot v30.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e91f // sdot v31.4s, v8.16b, v5.4b[2]\n"
"ldr q8, [%x[params], #0x40]\n"
- ".inst 0x4fa7e13c // sdot v28.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x4fa7e93b // sdot v27.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x4fa6e939 // sdot v25.4s, v9.16b, v6.4b[3]\n"
- ".inst 0x4fa5e138 // sdot v24.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e937 // sdot v23.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x4fa4e136 // sdot v22.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x4fa4e935 // sdot v21.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4f83e158 // sdot v24.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f83e959 // sdot v25.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4f84e15a // sdot v26.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e95b // sdot v27.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e13c // sdot v28.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e93d // sdot v29.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e13e // sdot v30.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e93f // sdot v31.4s, v9.16b, v5.4b[3]\n"
"ldr q9, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x4f86e15c // sdot v28.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f86e95b // sdot v27.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f85e15a // sdot v26.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f85e959 // sdot v25.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f84e158 // sdot v24.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f84e957 // sdot v23.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f82e156 // sdot v22.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f82e955 // sdot v21.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4fa6e17c // sdot v28.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x4fa6e97b // sdot v27.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x4fa5e17a // sdot v26.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x4fa5e979 // sdot v25.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x4fa4e178 // sdot v24.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x4fa4e977 // sdot v23.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x4fa2e176 // sdot v22.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x4fa2e975 // sdot v21.4s, v11.16b, v2.4b[3]\n"
- ".inst 0x4f85e11c // sdot v28.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f84e11a // sdot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4fa3e178 // sdot v24.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x4fa3e979 // sdot v25.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e17a // sdot v26.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e97b // sdot v27.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x4f85e15c // sdot v28.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x4f85e95d // sdot v29.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x4f86e15e // sdot v30.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x4f86e95f // sdot v31.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x4f84e118 // sdot v24.4s, v8.16b, v4.4b[0]\n"
".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e917 // sdot v23.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4fa5e13c // sdot v28.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x4fa4e13a // sdot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4f85e11a // sdot v26.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x4fa5e17c // sdot v28.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e97d // sdot v29.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x4fa6e17e // sdot v30.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e97f // sdot v31.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4fa4e138 // sdot v24.4s, v9.16b, v4.4b[1]\n"
".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x4fa2e937 // sdot v23.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x4fa1e136 // sdot v22.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4fa1e935 // sdot v21.4s, v9.16b, v1.4b[3]\n"
- "sqrdmulh v28.4s, v28.4s, v20.4s\n"
- "sqrdmulh v27.4s, v27.4s, v20.4s\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "and v18.16b, v28.16b, v19.16b\n"
- "and v17.16b, v27.16b, v19.16b\n"
- "and v16.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4fa5e13a // sdot v26.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f86e11c // sdot v28.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x4f86e91d // sdot v29.4s, v8.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f87e11e // sdot v30.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x4f87e91f // sdot v31.4s, v8.16b, v7.4b[2]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4fa6e13c // sdot v28.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e93d // sdot v29.4s, v9.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4fa7e13e // sdot v30.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e93f // sdot v31.4s, v9.16b, v7.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v26.4s, v26.4s, v16.4s\n"
- "and v16.16b, v25.16b, v19.16b\n"
- "srshl v28.4s, v28.4s, v19.4s\n"
- "srshl v27.4s, v27.4s, v19.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v27.4s, v27.4s, v0.4s\n"
- "add v26.4s, v26.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v30.4s\n"
- "smin v27.4s, v27.4s, v30.4s\n"
- "smin v26.4s, v26.4s, v30.4s\n"
- "smax v28.4s, v28.4s, v31.4s\n"
- "smax v27.4s, v27.4s, v31.4s\n"
- "smax v26.4s, v26.4s, v31.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sqrdmulh v24.4s, v24.4s, v20.4s\n"
- "sqrdmulh v23.4s, v23.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v19.4s\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "and v16.16b, v24.16b, v19.16b\n"
- "and v17.16b, v23.16b, v19.16b\n"
- "add v25.4s, v25.4s, v0.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v30.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "smax v25.4s, v25.4s, v31.4s\n"
- "and v16.16b, v22.16b, v19.16b\n"
- "srshl v24.4s, v24.4s, v19.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "srshl v23.4s, v23.4s, v19.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v0.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v30.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v30.4s\n"
- "smax v24.4s, v24.4s, v31.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v23.4s, v23.4s, v31.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "srshl v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "and v16.16b, v21.16b, v19.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "add v22.4s, v22.4s, v0.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v22.4s, v22.4s, v30.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v31.4s\n"
- "srshl v21.4s, v21.4s, v19.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "add v21.4s, v21.4s, v0.4s\n"
- "smin v21.4s, v21.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v31.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"blt 3f\n"
- "str s28, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s26, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s22, [x20, #0x0]\n"
- "str s21, [x19, #0x0]\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 4f\n"
"3:" // Tail: Oddments
- "st1 { v28.b }[0], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[0], [x25], #0x1\n"
- "st1 { v26.b }[0], [x24], #0x1\n"
- "st1 { v25.b }[0], [x23], #0x1\n"
- "st1 { v24.b }[0], [x22], #0x1\n"
- "st1 { v23.b }[0], [x21], #0x1\n"
- "st1 { v22.b }[0], [x20], #0x1\n"
- "st1 { v21.b }[0], [x19], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v28.b }[1], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[1], [x25], #0x1\n"
- "st1 { v26.b }[1], [x24], #0x1\n"
- "st1 { v25.b }[1], [x23], #0x1\n"
- "st1 { v24.b }[1], [x22], #0x1\n"
- "st1 { v23.b }[1], [x21], #0x1\n"
- "st1 { v22.b }[1], [x20], #0x1\n"
- "st1 { v21.b }[1], [x19], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v28.b }[2], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[2], [x25], #0x1\n"
- "st1 { v26.b }[2], [x24], #0x1\n"
- "st1 { v25.b }[2], [x23], #0x1\n"
- "st1 { v24.b }[2], [x22], #0x1\n"
- "st1 { v23.b }[2], [x21], #0x1\n"
- "st1 { v22.b }[2], [x20], #0x1\n"
- "st1 { v21.b }[2], [x19], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v28.b }[3], [x26], #0x1\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[3], [x25], #0x1\n"
- "st1 { v26.b }[3], [x24], #0x1\n"
- "st1 { v25.b }[3], [x23], #0x1\n"
- "st1 { v24.b }[3], [x22], #0x1\n"
- "st1 { v23.b }[3], [x21], #0x1\n"
- "st1 { v22.b }[3], [x20], #0x1\n"
- "st1 { v21.b }[3], [x19], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
"4:" // Tail: End
- "add SP, SP, #0x80\n"
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index bbfa9f439f..3a544e0697 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,1439 +45,1433 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
)
{
__asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v3.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
"mov x9, #0x0\n"
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v14.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v13.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v11.16b }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v9.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
- "ld1r { v8.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v7.4s }, [x19]\n"
- "lsr x28, %x[n_output_channels], #0x2\n"
- "cbz x28, 9f\n"
+ "cbz x10, 9f\n"
"1:" // Output channel loop
- "movi v16.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x9, #0x2\n"
- "ldr q16, [%x[bias], x19]\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "mov v6.16b, v16.16b\n"
- "mov v5.16b, v16.16b\n"
- "mov v4.16b, v16.16b\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v16.16b\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
"cbz %x[rq_mul_ptr], 3f\n"
- "lsl x19, x9, #0x2\n"
- "ldr q8, [%x[rq_mul_ptr], x19]\n"
- "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+ "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s17, [%x[weights]], #0x4\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "mov x19, %x[inptrs]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "ldr d3, [x25, #0x0]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr d2, [x27, #0x0]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "cbz x20, 7f\n"
- "ldp x25, x27, [x19], #0x10\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v3.8b\n"
+ "ssubl v7.8h, v7.8b, v3.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "cbz x21, 7f\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"ldr d1, [x25, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d0, [x27, #0x0]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v3.8b\n"
+ "ssubl v0.8h, v0.8b, v3.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "ssubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
"ldr d1, [x25, #0x0]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "ldr d0, [x27, #0x0]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ssubl v1.8h, v1.8b, v3.8b\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v3.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
- "b 8f\n"
- "6:" // Output channel loop: Odd tail
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldr d7, [x28, #0x0]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
- "b 8f\n"
- "7:" // Output channel loop: Single kernel point
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
- "cmp x9, x28, LSL #2\n"
+ "cmp x9, x10, LSL #2\n"
"blt 1b\n"
"tst %x[n_output_channels], #0x3\n"
"beq 26f\n"
"9:" // Output channel oddments
- "movi v16.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"cbz %x[bias], 12f\n"
- "add x19, %x[bias], x9, LSL #2\n"
+ "add x20, %x[bias], x9, LSL #2\n"
"tbz %x[n_output_channels], #1, 10f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 11f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 11f\n"
"10:" // Output channel oddments: Load bias: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 11f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "ld1 { v31.s }[0], [x20]\n"
"11:" // Output channel oddments: Load bias: Bit 1: End
-
"12:" // Output channel oddments: Load bias: Done
- "mov v6.16b, v16.16b\n"
- "mov v5.16b, v16.16b\n"
- "mov v4.16b, v16.16b\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v16.16b\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
"cbz %x[rq_mul_ptr], 18f\n"
- "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
- "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
- "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
- "ld1 { v8.d }[0], [x21], #0x8\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
- "ld1 { v8.s }[2], [x21], #0x4\n"
- "ld1 { v7.s }[2], [x20], #0x4\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 14f\n"
- "ld1 { v8.s }[0], [x21], #0x4\n"
- "ld1 { v7.s }[0], [x20], #0x4\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
- "ld1 { v8.d }[0], [x21], #0x8\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
- "ld1 { v8.s }[2], [x21], #0x4\n"
- "ld1 { v7.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 17f\n"
- "ld1 { v8.s }[0], [x21], #0x4\n"
- "ld1 { v7.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
-
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s17, [%x[weights]], #0x4\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "mov x19, %x[inptrs]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "ldr d3, [x25, #0x0]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr d2, [x27, #0x0]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "cbz x20, 22f\n"
- "ldp x25, x27, [x19], #0x10\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v3.8b\n"
+ "ssubl v7.8h, v7.8b, v3.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "cbz x21, 22f\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"ldr d1, [x25, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d0, [x27, #0x0]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v3.8b\n"
+ "ssubl v0.8h, v0.8b, v3.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "ssubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
"ldr d1, [x25, #0x0]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "ldr d0, [x27, #0x0]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ssubl v1.8h, v1.8b, v3.8b\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v3.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "ssubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.h }[0], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.h }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.h }[0], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.h }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.h }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.h }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.h }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.h }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.h }[0], [x25]\n"
+ "st1 { v23.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v19.h }[0], [x26]\n"
+ "st1 { v24.h }[0], [x20]\n"
+ "st1 { v25.h }[0], [x21]\n"
+ "st1 { v26.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x23]\n"
+ "st1 { v28.h }[0], [x24]\n"
+ "st1 { v29.h }[0], [x25]\n"
+ "st1 { v30.h }[0], [x26]\n"
+ "st1 { v31.h }[0], [x27]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.b }[2], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.b }[2], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.b }[2], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.b }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.b }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.b }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.b }[2], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.b }[2], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.b }[2], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.b }[2], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.b }[2], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.b }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.b }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.b }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v19.b }[2], [x26]\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v25.b }[2], [x21]\n"
+ "st1 { v26.b }[2], [x22]\n"
+ "st1 { v27.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x24]\n"
+ "st1 { v29.b }[2], [x25]\n"
+ "st1 { v30.b }[2], [x26]\n"
+ "st1 { v31.b }[2], [x27]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 25f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.b }[0], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.b }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.b }[0], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.b }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.b }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.b }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.b }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.b }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.b }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.b }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.b }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.b }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.b }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.b }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.b }[0], [x25]\n"
- "st1 { v19.b }[0], [x26]\n"
+ "st1 { v23.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
+ "st1 { v24.b }[0], [x20]\n"
+ "st1 { v25.b }[0], [x21]\n"
+ "st1 { v26.b }[0], [x22]\n"
+ "st1 { v27.b }[0], [x23]\n"
+ "st1 { v28.b }[0], [x24]\n"
+ "st1 { v29.b }[0], [x25]\n"
+ "st1 { v30.b }[0], [x26]\n"
+ "st1 { v31.b }[0], [x27]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
"26:" // Done
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 761c7ec86e..3fc1b13d9c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,1148 +41,1450 @@ void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ldp x15, x14, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "lsr x15, %x[n_channels], #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "mov x12, #0x0\n"
"mov x11, #0x0\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "lsr x28, %x[n_channels], #0x4\n"
- "ldp x27, x26, [%x[inptrs], #0x30]\n"
- "add x25, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
"ldp x24, x23, [%x[outptrs], #0x0]\n"
- "add x22, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ldp x21, x20, [%x[outptrs], #0x10]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v12.4s }, [x25]\n"
- "ld1r { v11.4s }, [x22]\n"
- "ld1r { v10.4s }, [x19]\n"
- "cbz x28, 2f\n"
- "1:" // Loop
- "ldr q27, [x15, x11]\n"
- "subs x28, x28, #0x1\n"
- "ldr q1, [x14, x11]\n"
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldr q25, [x13, x11]\n"
- "zip1 v6.16b, v27.16b, v25.16b\n"
- "ldr q23, [x12, x11]\n"
- "zip2 v9.16b, v27.16b, v25.16b\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "ldr q31, [x10, x11]\n"
- "zip1 v5.16b, v1.16b, v23.16b\n"
- "ldr q28, [x9, x11]\n"
- "zip2 v3.16b, v1.16b, v23.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "zip1 v8.16b, v6.16b, v5.16b\n"
- "ldr q21, [x27, x11]\n"
- "zip2 v7.16b, v6.16b, v5.16b\n"
- "ldr q26, [x26, x11]\n"
- "zip1 v6.16b, v9.16b, v3.16b\n"
- "ldp x27, x26, [%x[inptrs], #0x70]\n"
- "zip2 v5.16b, v9.16b, v3.16b\n"
- "ldr q24, [x15, x11]\n"
- "ldr q22, [x14, x11]\n"
- "zip1 v2.16b, v31.16b, v21.16b\n"
- "zip2 v4.16b, v31.16b, v21.16b\n"
- "ldp x15, x14, [%x[inptrs], #0x0]\n"
- "zip1 v1.16b, v28.16b, v26.16b\n"
- "ldr q20, [x13, x11]\n"
- "zip2 v31.16b, v28.16b, v26.16b\n"
- "ldr q16, [x12, x11]\n"
- "zip1 v3.16b, v2.16b, v1.16b\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "zip2 v2.16b, v2.16b, v1.16b\n"
- "ldr q19, [x10, x11]\n"
- "zip1 v1.16b, v4.16b, v31.16b\n"
- "ldr q0, [x9, x11]\n"
- "zip1 v28.16b, v24.16b, v20.16b\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "zip2 v26.16b, v24.16b, v20.16b\n"
- "ldr q18, [x27, x11]\n"
- "zip1 v24.16b, v22.16b, v16.16b\n"
- "ldr q17, [x26, x11]\n"
- "zip2 v22.16b, v22.16b, v16.16b\n"
- "ldp x27, x26, [%x[inptrs], #0x30]\n"
- "zip2 v16.16b, v4.16b, v31.16b\n"
- "str q6, [SP, #0x0]\n"
- "zip1 v31.16b, v28.16b, v24.16b\n"
- "str q5, [SP, #0x10]\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "str q1, [SP, #0x20]\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "str q16, [SP, #0x30]\n"
- "zip1 v18.16b, v0.16b, v17.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "zip2 v17.16b, v0.16b, v17.16b\n"
+ "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr q6, [x14, x12]\n"
+ "ldr q5, [x13, x12]\n"
+ "subs x15, x15, #0x1\n"
+ "ldr q4, [x10, x12]\n"
+ "ldr q3, [x9, x12]\n"
+ "zip2 v2.16b, v6.16b, v4.16b\n"
+ "zip1 v6.16b, v6.16b, v4.16b\n"
+ "ldr q1, [x28, x12]\n"
+ "ldr q0, [x27, x12]\n"
+ "zip1 v4.16b, v5.16b, v3.16b\n"
+ "zip2 v3.16b, v5.16b, v3.16b\n"
+ "ldr q31, [x26, x12]\n"
+ "ldr q30, [x25, x12]\n"
+ "zip2 v5.16b, v6.16b, v4.16b\n"
+ "zip1 v6.16b, v6.16b, v4.16b\n"
"ldr q29, [%x[params], #0x10]\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "zip1 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x40]\n"
- "zip2 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x50]\n"
- "zip1 v26.16b, v20.16b, v18.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "zip2 v24.16b, v20.16b, v18.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "zip1 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x60]\n"
- "zip2 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x70]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
- "mov v20.16b, v30.16b\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- "ldr q8, [SP, #0x0]\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- "ldr q29, [%x[params], #0x70]\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- "ldr q3, [SP, #0x20]\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- "ldr q27, [%x[params], #0x80]\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- "ldr q31, [SP, #0x40]\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "ldr q25, [%x[params], #0x90]\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "ldr q26, [SP, #0x60]\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0xa0]\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0xb0]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x24, x11]\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q30, [%x[params], #0x60]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x21, x11]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x20, x11]\n"
- "mov v19.16b, v30.16b\n"
+ "ldr q28, [%x[params], #0x20]\n"
+ "zip1 v4.16b, v2.16b, v3.16b\n"
+ "zip2 v3.16b, v2.16b, v3.16b\n"
+ "ldr q2, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x30]\n"
+ "zip2 v26.16b, v1.16b, v31.16b\n"
+ "zip1 v1.16b, v1.16b, v31.16b\n"
+ "ldp x14, x13, [%x[inptrs], #0x40]\n"
+ "ldr q25, [x14, x12]\n"
+ "zip1 v31.16b, v0.16b, v30.16b\n"
+ "zip2 v30.16b, v0.16b, v30.16b\n"
+ "ldr q24, [x13, x12]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "zip2 v0.16b, v1.16b, v31.16b\n"
+ "zip1 v1.16b, v1.16b, v31.16b\n"
+ "ldr q23, [x10, x12]\n"
+ "ldr q22, [x9, x12]\n"
+ "zip2 v21.16b, v25.16b, v23.16b\n"
+ "zip1 v25.16b, v25.16b, v23.16b\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldr q20, [x28, x12]\n"
+ "zip1 v23.16b, v24.16b, v22.16b\n"
+ "zip2 v22.16b, v24.16b, v22.16b\n"
+ "ldr q19, [x27, x12]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "zip1 v31.16b, v26.16b, v30.16b\n"
+ "zip2 v30.16b, v26.16b, v30.16b\n"
+ "ldr q18, [x26, x12]\n"
+ "ldr q17, [x25, x12]\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v18.16b\n"
+ "zip1 v18.16b, v19.16b, v17.16b\n"
+ "zip2 v17.16b, v19.16b, v17.16b\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip2 v24.16b, v25.16b, v23.16b\n"
+ "zip1 v25.16b, v25.16b, v23.16b\n"
+ "zip1 v23.16b, v21.16b, v22.16b\n"
+ "zip2 v22.16b, v21.16b, v22.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v19.16b, v20.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v18.16b\n"
+ "zip1 v18.16b, v16.16b, v17.16b\n"
+ "zip2 v17.16b, v16.16b, v17.16b\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v16.16b, v2.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
+ "ldr q6, [%x[params], #0x0]\n"
+ ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
+ ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [%x[params], #0x60]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [%x[params], #0x70]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x24, x11]\n"
+ "ldr q2, [%x[params], #0x20]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "mov v26.16b, v2.16b\n"
+ "str s16, [x21, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
"add x11, x11, #0x4\n"
- ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
+ "ldr q5, [x13, x12]\n"
+ ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
+ ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
"ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
- "ldr q7, [SP, #0x10]\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- "ldr q29, [%x[params], #0xd0]\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- "ldr q2, [SP, #0x30]\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- "ldr q27, [%x[params], #0xe0]\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- "ldr q28, [SP, #0x50]\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "ldr q25, [%x[params], #0xf0]\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "ldr q24, [SP, #0x70]\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x100]\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0x110]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x24, x11]\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q30, [%x[params], #0xc0]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x21, x11]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x20, x11]\n"
- "mov v19.16b, v30.16b\n"
+ ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
+ "ldr q0, [x27, x12]\n"
+ ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [%x[params], #0xc0]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0xb0]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str s2, [x24, x11]\n"
+ "ldr q2, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
+ ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
"add x11, x11, #0x4\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
"ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- "ldr q29, [%x[params], #0x130]\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- "ldr q27, [%x[params], #0x140]\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "ldr q25, [%x[params], #0x150]\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x160]\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0x170]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x24, x11]\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q30, [%x[params], #0x120]\n"
- "add %x[params], %x[params], #0x180\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v22.16b, v30.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x21, x11]\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "mov v20.16b, v30.16b\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x20, x11]\n"
- "mov v19.16b, v30.16b\n"
+ ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
+ "ldr q4, [x10, x12]\n"
+ ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
+ ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
+ "ldr q31, [x26, x12]\n"
+ ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [%x[params], #0x120]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0x100]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0xf0]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [%x[params], #0x130]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x24, x11]\n"
+ "ldr q2, [%x[params], #0xe0]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "mov v26.16b, v2.16b\n"
+ "str s16, [x21, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
+ ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add x11, x11, #0x4\n"
- ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x24, x11]\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x21, x11]\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x20, x11]\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
+ "ldr q3, [x9, x12]\n"
+ ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
+ ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
+ ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
+ "ldr q30, [x25, x12]\n"
+ ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
+ ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [x14, x12]\n"
+ "ldp x14, x13, [%x[inptrs], #0x40]\n"
+ "ldr q25, [x14, x12]\n"
+ "ldr q24, [x13, x12]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "ldr q23, [x10, x12]\n"
+ "ldr q22, [x9, x12]\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0x160]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0x170]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0x150]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [x28, x12]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldr q20, [x28, x12]\n"
+ "ldr q19, [x27, x12]\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldr q18, [x26, x12]\n"
+ "ldr q17, [x25, x12]\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str s2, [x24, x11]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "zip2 v2.16b, v6.16b, v4.16b\n"
+ "zip1 v6.16b, v6.16b, v4.16b\n"
+ "zip1 v4.16b, v5.16b, v3.16b\n"
+ "zip2 v3.16b, v5.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "zip2 v5.16b, v6.16b, v4.16b\n"
+ "zip1 v6.16b, v6.16b, v4.16b\n"
"add x11, x11, #0x4\n"
+ "zip1 v4.16b, v2.16b, v3.16b\n"
+ "zip2 v3.16b, v2.16b, v3.16b\n"
+ "ldr q2, [%x[params], #0x140]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip2 v26.16b, v1.16b, v31.16b\n"
+ "zip1 v1.16b, v1.16b, v31.16b\n"
+ "zip1 v31.16b, v0.16b, v30.16b\n"
+ "zip2 v30.16b, v0.16b, v30.16b\n"
+ "zip2 v21.16b, v25.16b, v23.16b\n"
+ "zip1 v25.16b, v25.16b, v23.16b\n"
+ "zip1 v23.16b, v24.16b, v22.16b\n"
+ "zip2 v22.16b, v24.16b, v22.16b\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v18.16b\n"
+ "zip1 v18.16b, v19.16b, v17.16b\n"
+ "zip2 v17.16b, v19.16b, v17.16b\n"
+ "zip2 v0.16b, v1.16b, v31.16b\n"
+ "zip1 v1.16b, v1.16b, v31.16b\n"
+ "zip1 v31.16b, v26.16b, v30.16b\n"
+ "zip2 v30.16b, v26.16b, v30.16b\n"
+ "zip2 v24.16b, v25.16b, v23.16b\n"
+ "zip1 v25.16b, v25.16b, v23.16b\n"
+ "zip1 v23.16b, v21.16b, v22.16b\n"
+ "zip2 v22.16b, v21.16b, v22.16b\n"
+ "zip2 v19.16b, v20.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v18.16b\n"
+ "zip1 v18.16b, v16.16b, v17.16b\n"
+ "zip2 v17.16b, v16.16b, v17.16b\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v16.16b, v2.16b\n"
"bgt 1b\n"
+ "2:" // Detached iteration
+ ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
"tst %x[n_channels], #0xf\n"
- "beq 34f\n"
- "2:" // Oddments
- "and x19, %x[n_channels], #0xf\n"
- "add x15, x15, x11\n"
- "add x14, x14, x11\n"
- "add x13, x13, x11\n"
- "add x12, x12, x11\n"
- "add x10, x10, x11\n"
- "add x9, x9, x11\n"
- "add x27, x27, x11\n"
- "add x26, x26, x11\n"
- "tbz %x[n_channels], #3, 6f\n"
- "ld1 { v27.d }[0], [x15], #0x8\n"
- "ld1 { v1.d }[0], [x14], #0x8\n"
- "ld1 { v25.d }[0], [x13], #0x8\n"
- "ld1 { v23.d }[0], [x12], #0x8\n"
- "ld1 { v31.d }[0], [x10], #0x8\n"
- "ld1 { v28.d }[0], [x9], #0x8\n"
- "ld1 { v21.d }[0], [x27], #0x8\n"
- "ld1 { v26.d }[0], [x26], #0x8\n"
- "tbz %x[n_channels], #2, 4f\n"
- "ld1 { v27.s }[2], [x15], #0x4\n"
- "ld1 { v1.s }[2], [x14], #0x4\n"
- "ld1 { v25.s }[2], [x13], #0x4\n"
- "ld1 { v23.s }[2], [x12], #0x4\n"
- "ld1 { v31.s }[2], [x10], #0x4\n"
- "ld1 { v28.s }[2], [x9], #0x4\n"
- "ld1 { v21.s }[2], [x27], #0x4\n"
- "ld1 { v26.s }[2], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 3f\n"
- "ld1 { v27.h }[6], [x15], #0x2\n"
- "ld1 { v1.h }[6], [x14], #0x2\n"
- "ld1 { v25.h }[6], [x13], #0x2\n"
- "ld1 { v23.h }[6], [x12], #0x2\n"
- "ld1 { v31.h }[6], [x10], #0x2\n"
- "ld1 { v28.h }[6], [x9], #0x2\n"
- "ld1 { v21.h }[6], [x27], #0x2\n"
- "ld1 { v26.h }[6], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[14], [x15], #0x1\n"
- "ld1 { v1.b }[14], [x14], #0x1\n"
- "ld1 { v25.b }[14], [x13], #0x1\n"
- "ld1 { v23.b }[14], [x12], #0x1\n"
- "ld1 { v31.b }[14], [x10], #0x1\n"
- "ld1 { v28.b }[14], [x9], #0x1\n"
- "ld1 { v21.b }[14], [x27], #0x1\n"
- "ld1 { v26.b }[14], [x26], #0x1\n"
- "b 10f\n"
- "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[12], [x15], #0x1\n"
- "ld1 { v1.b }[12], [x14], #0x1\n"
- "ld1 { v25.b }[12], [x13], #0x1\n"
- "ld1 { v23.b }[12], [x12], #0x1\n"
- "ld1 { v31.b }[12], [x10], #0x1\n"
- "ld1 { v28.b }[12], [x9], #0x1\n"
- "ld1 { v21.b }[12], [x27], #0x1\n"
- "ld1 { v26.b }[12], [x26], #0x1\n"
- "b 10f\n"
- "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset
- "tbz %x[n_channels], #1, 5f\n"
- "ld1 { v27.h }[4], [x15], #0x2\n"
- "ld1 { v1.h }[4], [x14], #0x2\n"
- "ld1 { v25.h }[4], [x13], #0x2\n"
- "ld1 { v23.h }[4], [x12], #0x2\n"
- "ld1 { v31.h }[4], [x10], #0x2\n"
- "ld1 { v28.h }[4], [x9], #0x2\n"
- "ld1 { v21.h }[4], [x27], #0x2\n"
- "ld1 { v26.h }[4], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[10], [x15], #0x1\n"
- "ld1 { v1.b }[10], [x14], #0x1\n"
- "ld1 { v25.b }[10], [x13], #0x1\n"
- "ld1 { v23.b }[10], [x12], #0x1\n"
- "ld1 { v31.b }[10], [x10], #0x1\n"
- "ld1 { v28.b }[10], [x9], #0x1\n"
- "ld1 { v21.b }[10], [x27], #0x1\n"
- "ld1 { v26.b }[10], [x26], #0x1\n"
- "b 10f\n"
- "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[8], [x15], #0x1\n"
- "ld1 { v1.b }[8], [x14], #0x1\n"
- "ld1 { v25.b }[8], [x13], #0x1\n"
- "ld1 { v23.b }[8], [x12], #0x1\n"
- "ld1 { v31.b }[8], [x10], #0x1\n"
- "ld1 { v28.b }[8], [x9], #0x1\n"
- "ld1 { v21.b }[8], [x27], #0x1\n"
- "ld1 { v26.b }[8], [x26], #0x1\n"
- "b 10f\n"
- "6:" // Oddments: Load (A): Bit 3: Unset
- "tbz %x[n_channels], #2, 8f\n"
- "ld1 { v27.s }[0], [x15], #0x4\n"
- "ld1 { v1.s }[0], [x14], #0x4\n"
- "ld1 { v25.s }[0], [x13], #0x4\n"
- "ld1 { v23.s }[0], [x12], #0x4\n"
- "ld1 { v31.s }[0], [x10], #0x4\n"
- "ld1 { v28.s }[0], [x9], #0x4\n"
- "ld1 { v21.s }[0], [x27], #0x4\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 7f\n"
- "ld1 { v27.h }[2], [x15], #0x2\n"
- "ld1 { v1.h }[2], [x14], #0x2\n"
- "ld1 { v25.h }[2], [x13], #0x2\n"
- "ld1 { v23.h }[2], [x12], #0x2\n"
- "ld1 { v31.h }[2], [x10], #0x2\n"
- "ld1 { v28.h }[2], [x9], #0x2\n"
- "ld1 { v21.h }[2], [x27], #0x2\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[6], [x15], #0x1\n"
- "ld1 { v1.b }[6], [x14], #0x1\n"
- "ld1 { v25.b }[6], [x13], #0x1\n"
- "ld1 { v23.b }[6], [x12], #0x1\n"
- "ld1 { v31.b }[6], [x10], #0x1\n"
- "ld1 { v28.b }[6], [x9], #0x1\n"
- "ld1 { v21.b }[6], [x27], #0x1\n"
- "ld1 { v26.b }[6], [x26], #0x1\n"
- "b 10f\n"
- "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[4], [x15], #0x1\n"
- "ld1 { v1.b }[4], [x14], #0x1\n"
- "ld1 { v25.b }[4], [x13], #0x1\n"
- "ld1 { v23.b }[4], [x12], #0x1\n"
- "ld1 { v31.b }[4], [x10], #0x1\n"
- "ld1 { v28.b }[4], [x9], #0x1\n"
- "ld1 { v21.b }[4], [x27], #0x1\n"
- "ld1 { v26.b }[4], [x26], #0x1\n"
- "b 10f\n"
- "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
- "tbz %x[n_channels], #1, 9f\n"
- "ld1 { v27.h }[0], [x15], #0x2\n"
- "ld1 { v1.h }[0], [x14], #0x2\n"
- "ld1 { v25.h }[0], [x13], #0x2\n"
- "ld1 { v23.h }[0], [x12], #0x2\n"
- "ld1 { v31.h }[0], [x10], #0x2\n"
- "ld1 { v28.h }[0], [x9], #0x2\n"
- "ld1 { v21.h }[0], [x27], #0x2\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[2], [x15], #0x1\n"
- "ld1 { v1.b }[2], [x14], #0x1\n"
- "ld1 { v25.b }[2], [x13], #0x1\n"
- "ld1 { v23.b }[2], [x12], #0x1\n"
- "ld1 { v31.b }[2], [x10], #0x1\n"
- "ld1 { v28.b }[2], [x9], #0x1\n"
- "ld1 { v21.b }[2], [x27], #0x1\n"
- "ld1 { v26.b }[2], [x26], #0x1\n"
- "b 10f\n"
- "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[0], [x15], #0x1\n"
- "ld1 { v1.b }[0], [x14], #0x1\n"
- "ld1 { v25.b }[0], [x13], #0x1\n"
- "ld1 { v23.b }[0], [x12], #0x1\n"
- "ld1 { v31.b }[0], [x10], #0x1\n"
- "ld1 { v28.b }[0], [x9], #0x1\n"
- "ld1 { v21.b }[0], [x27], #0x1\n"
- "ld1 { v26.b }[0], [x26], #0x1\n"
- "10:" // Oddments: Load (A): Bit 3: End
- "ldp x15, x14, [%x[inptrs], #0x40]\n"
- "add x15, x15, x11\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "add x14, x14, x11\n"
- "ldp x27, x26, [%x[inptrs], #0x70]\n"
- "add x13, x13, x11\n"
- "add x12, x12, x11\n"
- "add x10, x10, x11\n"
- "add x9, x9, x11\n"
- "add x27, x27, x11\n"
- "add x26, x26, x11\n"
- "tbz %x[n_channels], #3, 14f\n"
- "ld1 { v24.d }[0], [x15], #0x8\n"
- "ld1 { v22.d }[0], [x14], #0x8\n"
- "ld1 { v20.d }[0], [x13], #0x8\n"
- "ld1 { v16.d }[0], [x12], #0x8\n"
- "ld1 { v19.d }[0], [x10], #0x8\n"
- "ld1 { v0.d }[0], [x9], #0x8\n"
- "ld1 { v18.d }[0], [x27], #0x8\n"
- "ld1 { v17.d }[0], [x26], #0x8\n"
- "tbz %x[n_channels], #2, 12f\n"
- "ld1 { v24.s }[2], [x15], #0x4\n"
- "ld1 { v22.s }[2], [x14], #0x4\n"
- "ld1 { v20.s }[2], [x13], #0x4\n"
- "ld1 { v16.s }[2], [x12], #0x4\n"
- "ld1 { v19.s }[2], [x10], #0x4\n"
- "ld1 { v0.s }[2], [x9], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "ld1 { v17.s }[2], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 11f\n"
- "ld1 { v24.h }[6], [x15], #0x2\n"
- "ld1 { v22.h }[6], [x14], #0x2\n"
- "ld1 { v20.h }[6], [x13], #0x2\n"
- "ld1 { v16.h }[6], [x12], #0x2\n"
- "ld1 { v19.h }[6], [x10], #0x2\n"
- "ld1 { v0.h }[6], [x9], #0x2\n"
- "ld1 { v18.h }[6], [x27], #0x2\n"
- "ld1 { v17.h }[6], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[14], [x15], #0x1\n"
- "ld1 { v22.b }[14], [x14], #0x1\n"
- "ld1 { v20.b }[14], [x13], #0x1\n"
- "ld1 { v16.b }[14], [x12], #0x1\n"
- "ld1 { v19.b }[14], [x10], #0x1\n"
- "ld1 { v0.b }[14], [x9], #0x1\n"
- "ld1 { v18.b }[14], [x27], #0x1\n"
- "ld1 { v17.b }[14], [x26], #0x1\n"
- "b 18f\n"
- "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[12], [x15], #0x1\n"
- "ld1 { v22.b }[12], [x14], #0x1\n"
- "ld1 { v20.b }[12], [x13], #0x1\n"
- "ld1 { v16.b }[12], [x12], #0x1\n"
- "ld1 { v19.b }[12], [x10], #0x1\n"
- "ld1 { v0.b }[12], [x9], #0x1\n"
- "ld1 { v18.b }[12], [x27], #0x1\n"
- "ld1 { v17.b }[12], [x26], #0x1\n"
- "b 18f\n"
- "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset
- "tbz %x[n_channels], #1, 13f\n"
- "ld1 { v24.h }[4], [x15], #0x2\n"
- "ld1 { v22.h }[4], [x14], #0x2\n"
- "ld1 { v20.h }[4], [x13], #0x2\n"
- "ld1 { v16.h }[4], [x12], #0x2\n"
- "ld1 { v19.h }[4], [x10], #0x2\n"
- "ld1 { v0.h }[4], [x9], #0x2\n"
- "ld1 { v18.h }[4], [x27], #0x2\n"
- "ld1 { v17.h }[4], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[10], [x15], #0x1\n"
- "ld1 { v22.b }[10], [x14], #0x1\n"
- "ld1 { v20.b }[10], [x13], #0x1\n"
- "ld1 { v16.b }[10], [x12], #0x1\n"
- "ld1 { v19.b }[10], [x10], #0x1\n"
- "ld1 { v0.b }[10], [x9], #0x1\n"
- "ld1 { v18.b }[10], [x27], #0x1\n"
- "ld1 { v17.b }[10], [x26], #0x1\n"
- "b 18f\n"
- "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[8], [x15], #0x1\n"
- "ld1 { v22.b }[8], [x14], #0x1\n"
- "ld1 { v20.b }[8], [x13], #0x1\n"
- "ld1 { v16.b }[8], [x12], #0x1\n"
- "ld1 { v19.b }[8], [x10], #0x1\n"
- "ld1 { v0.b }[8], [x9], #0x1\n"
- "ld1 { v18.b }[8], [x27], #0x1\n"
- "ld1 { v17.b }[8], [x26], #0x1\n"
- "b 18f\n"
- "14:" // Oddments: Load (B): Bit 3: Unset
- "tbz %x[n_channels], #2, 16f\n"
- "ld1 { v24.s }[0], [x15], #0x4\n"
- "ld1 { v22.s }[0], [x14], #0x4\n"
- "ld1 { v20.s }[0], [x13], #0x4\n"
- "ld1 { v16.s }[0], [x12], #0x4\n"
- "ld1 { v19.s }[0], [x10], #0x4\n"
- "ld1 { v0.s }[0], [x9], #0x4\n"
- "ld1 { v18.s }[0], [x27], #0x4\n"
- "ld1 { v17.s }[0], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 15f\n"
- "ld1 { v24.h }[2], [x15], #0x2\n"
- "ld1 { v22.h }[2], [x14], #0x2\n"
- "ld1 { v20.h }[2], [x13], #0x2\n"
- "ld1 { v16.h }[2], [x12], #0x2\n"
- "ld1 { v19.h }[2], [x10], #0x2\n"
- "ld1 { v0.h }[2], [x9], #0x2\n"
- "ld1 { v18.h }[2], [x27], #0x2\n"
- "ld1 { v17.h }[2], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[6], [x15], #0x1\n"
- "ld1 { v22.b }[6], [x14], #0x1\n"
- "ld1 { v20.b }[6], [x13], #0x1\n"
- "ld1 { v16.b }[6], [x12], #0x1\n"
- "ld1 { v19.b }[6], [x10], #0x1\n"
- "ld1 { v0.b }[6], [x9], #0x1\n"
- "ld1 { v18.b }[6], [x27], #0x1\n"
- "ld1 { v17.b }[6], [x26], #0x1\n"
- "b 18f\n"
- "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[4], [x15], #0x1\n"
- "ld1 { v22.b }[4], [x14], #0x1\n"
- "ld1 { v20.b }[4], [x13], #0x1\n"
- "ld1 { v16.b }[4], [x12], #0x1\n"
- "ld1 { v19.b }[4], [x10], #0x1\n"
- "ld1 { v0.b }[4], [x9], #0x1\n"
- "ld1 { v18.b }[4], [x27], #0x1\n"
- "ld1 { v17.b }[4], [x26], #0x1\n"
- "b 18f\n"
- "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
- "tbz %x[n_channels], #1, 17f\n"
- "ld1 { v24.h }[0], [x15], #0x2\n"
- "ld1 { v22.h }[0], [x14], #0x2\n"
- "ld1 { v20.h }[0], [x13], #0x2\n"
- "ld1 { v16.h }[0], [x12], #0x2\n"
- "ld1 { v19.h }[0], [x10], #0x2\n"
- "ld1 { v0.h }[0], [x9], #0x2\n"
- "ld1 { v18.h }[0], [x27], #0x2\n"
- "ld1 { v17.h }[0], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[2], [x15], #0x1\n"
- "ld1 { v22.b }[2], [x14], #0x1\n"
- "ld1 { v20.b }[2], [x13], #0x1\n"
- "ld1 { v16.b }[2], [x12], #0x1\n"
- "ld1 { v19.b }[2], [x10], #0x1\n"
- "ld1 { v0.b }[2], [x9], #0x1\n"
- "ld1 { v18.b }[2], [x27], #0x1\n"
- "ld1 { v17.b }[2], [x26], #0x1\n"
- "b 18f\n"
- "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[0], [x15], #0x1\n"
- "ld1 { v22.b }[0], [x14], #0x1\n"
- "ld1 { v20.b }[0], [x13], #0x1\n"
- "ld1 { v16.b }[0], [x12], #0x1\n"
- "ld1 { v19.b }[0], [x10], #0x1\n"
- "ld1 { v0.b }[0], [x9], #0x1\n"
- "ld1 { v18.b }[0], [x27], #0x1\n"
- "ld1 { v17.b }[0], [x26], #0x1\n"
- "18:" // Oddments: Load (B): Bit 3: End
- "zip1 v6.16b, v27.16b, v25.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "cmp x19, #0x4\n"
- "zip2 v9.16b, v27.16b, v25.16b\n"
+ ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
+ "ldr q6, [%x[params], #0x0]\n"
+ ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
+ ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [%x[params], #0x60]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [%x[params], #0x70]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x24, x11]\n"
+ "ldr q2, [%x[params], #0x20]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "mov v26.16b, v2.16b\n"
+ "str s16, [x21, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
+ ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [%x[params], #0xc0]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0xb0]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str s2, [x24, x11]\n"
+ "ldr q2, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
+ ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
+ ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
+ ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
+ ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "ldr q6, [%x[params], #0x120]\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "ldr q28, [%x[params], #0x100]\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "ldr q29, [%x[params], #0xf0]\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "ldr q1, [%x[params], #0x130]\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x24, x11]\n"
+ "ldr q2, [%x[params], #0xe0]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s26, [x23, x11]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s21, [x22, x11]\n"
+ "mov v26.16b, v2.16b\n"
+ "str s16, [x21, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
+ ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
+ ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
+ ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
+ ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
+ ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s2, [x24, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s26, [x23, x11]\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "add x11, x11, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x25, x25, x12\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d6, [x14], #0x8\n"
+ "ldr d5, [x13], #0x8\n"
+ "ldr d4, [x10], #0x8\n"
+ "ldr d3, [x9], #0x8\n"
+ "ldr d1, [x28], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d30, [x25], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v6.s }[2], [x14], #0x4\n"
+ "ld1 { v5.s }[2], [x13], #0x4\n"
+ "ld1 { v4.s }[2], [x10], #0x4\n"
+ "ld1 { v3.s }[2], [x9], #0x4\n"
+ "ld1 { v1.s }[2], [x28], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v30.s }[2], [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v6.h }[6], [x14], #0x2\n"
+ "ld1 { v5.h }[6], [x13], #0x2\n"
+ "ld1 { v4.h }[6], [x10], #0x2\n"
+ "ld1 { v3.h }[6], [x9], #0x2\n"
+ "ld1 { v1.h }[6], [x28], #0x2\n"
+ "ld1 { v0.h }[6], [x27], #0x2\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v30.h }[6], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[14], [x14], #0x1\n"
+ "ld1 { v5.b }[14], [x13], #0x1\n"
+ "ld1 { v4.b }[14], [x10], #0x1\n"
+ "ld1 { v3.b }[14], [x9], #0x1\n"
+ "ld1 { v1.b }[14], [x28], #0x1\n"
+ "ld1 { v0.b }[14], [x27], #0x1\n"
+ "ld1 { v31.b }[14], [x26], #0x1\n"
+ "ld1 { v30.b }[14], [x25], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[12], [x14], #0x1\n"
+ "ld1 { v5.b }[12], [x13], #0x1\n"
+ "ld1 { v4.b }[12], [x10], #0x1\n"
+ "ld1 { v3.b }[12], [x9], #0x1\n"
+ "ld1 { v1.b }[12], [x28], #0x1\n"
+ "ld1 { v0.b }[12], [x27], #0x1\n"
+ "ld1 { v31.b }[12], [x26], #0x1\n"
+ "ld1 { v30.b }[12], [x25], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v6.h }[4], [x14], #0x2\n"
+ "ld1 { v5.h }[4], [x13], #0x2\n"
+ "ld1 { v4.h }[4], [x10], #0x2\n"
+ "ld1 { v3.h }[4], [x9], #0x2\n"
+ "ld1 { v1.h }[4], [x28], #0x2\n"
+ "ld1 { v0.h }[4], [x27], #0x2\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v30.h }[4], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[10], [x14], #0x1\n"
+ "ld1 { v5.b }[10], [x13], #0x1\n"
+ "ld1 { v4.b }[10], [x10], #0x1\n"
+ "ld1 { v3.b }[10], [x9], #0x1\n"
+ "ld1 { v1.b }[10], [x28], #0x1\n"
+ "ld1 { v0.b }[10], [x27], #0x1\n"
+ "ld1 { v31.b }[10], [x26], #0x1\n"
+ "ld1 { v30.b }[10], [x25], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[8], [x14], #0x1\n"
+ "ld1 { v5.b }[8], [x13], #0x1\n"
+ "ld1 { v4.b }[8], [x10], #0x1\n"
+ "ld1 { v3.b }[8], [x9], #0x1\n"
+ "ld1 { v1.b }[8], [x28], #0x1\n"
+ "ld1 { v0.b }[8], [x27], #0x1\n"
+ "ld1 { v31.b }[8], [x26], #0x1\n"
+ "ld1 { v30.b }[8], [x25], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s6, [x14], #0x4\n"
+ "ldr s5, [x13], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s30, [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.h }[2], [x14], #0x2\n"
+ "ld1 { v5.h }[2], [x13], #0x2\n"
+ "ld1 { v4.h }[2], [x10], #0x2\n"
+ "ld1 { v3.h }[2], [x9], #0x2\n"
+ "ld1 { v1.h }[2], [x28], #0x2\n"
+ "ld1 { v0.h }[2], [x27], #0x2\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[6], [x14], #0x1\n"
+ "ld1 { v5.b }[6], [x13], #0x1\n"
+ "ld1 { v4.b }[6], [x10], #0x1\n"
+ "ld1 { v3.b }[6], [x9], #0x1\n"
+ "ld1 { v1.b }[6], [x28], #0x1\n"
+ "ld1 { v0.b }[6], [x27], #0x1\n"
+ "ld1 { v31.b }[6], [x26], #0x1\n"
+ "ld1 { v30.b }[6], [x25], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[4], [x14], #0x1\n"
+ "ld1 { v5.b }[4], [x13], #0x1\n"
+ "ld1 { v4.b }[4], [x10], #0x1\n"
+ "ld1 { v3.b }[4], [x9], #0x1\n"
+ "ld1 { v1.b }[4], [x28], #0x1\n"
+ "ld1 { v0.b }[4], [x27], #0x1\n"
+ "ld1 { v31.b }[4], [x26], #0x1\n"
+ "ld1 { v30.b }[4], [x25], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h6, [x14], #0x2\n"
+ "ldr h5, [x13], #0x2\n"
+ "ldr h4, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h0, [x27], #0x2\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h30, [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v6.b }[2], [x14], #0x1\n"
+ "ld1 { v5.b }[2], [x13], #0x1\n"
+ "ld1 { v4.b }[2], [x10], #0x1\n"
+ "ld1 { v3.b }[2], [x9], #0x1\n"
+ "ld1 { v1.b }[2], [x28], #0x1\n"
+ "ld1 { v0.b }[2], [x27], #0x1\n"
+ "ld1 { v31.b }[2], [x26], #0x1\n"
+ "ld1 { v30.b }[2], [x25], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b6, [x14], #0x1\n"
+ "ldr b5, [x13], #0x1\n"
+ "ldr b4, [x10], #0x1\n"
+ "ldr b3, [x9], #0x1\n"
+ "ldr b1, [x28], #0x1\n"
+ "ldr b0, [x27], #0x1\n"
+ "ldr b31, [x26], #0x1\n"
+ "ldr b30, [x25], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x14, x13, [%x[inptrs], #0x40]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x25, x25, x12\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d24, [x13], #0x8\n"
+ "ldr d23, [x10], #0x8\n"
+ "ldr d22, [x9], #0x8\n"
+ "ldr d20, [x28], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d18, [x26], #0x8\n"
+ "ldr d17, [x25], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v24.s }[2], [x13], #0x4\n"
+ "ld1 { v23.s }[2], [x10], #0x4\n"
+ "ld1 { v22.s }[2], [x9], #0x4\n"
+ "ld1 { v20.s }[2], [x28], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v24.h }[6], [x13], #0x2\n"
+ "ld1 { v23.h }[6], [x10], #0x2\n"
+ "ld1 { v22.h }[6], [x9], #0x2\n"
+ "ld1 { v20.h }[6], [x28], #0x2\n"
+ "ld1 { v19.h }[6], [x27], #0x2\n"
+ "ld1 { v18.h }[6], [x26], #0x2\n"
+ "ld1 { v17.h }[6], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v24.b }[14], [x13], #0x1\n"
+ "ld1 { v23.b }[14], [x10], #0x1\n"
+ "ld1 { v22.b }[14], [x9], #0x1\n"
+ "ld1 { v20.b }[14], [x28], #0x1\n"
+ "ld1 { v19.b }[14], [x27], #0x1\n"
+ "ld1 { v18.b }[14], [x26], #0x1\n"
+ "ld1 { v17.b }[14], [x25], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v24.b }[12], [x13], #0x1\n"
+ "ld1 { v23.b }[12], [x10], #0x1\n"
+ "ld1 { v22.b }[12], [x9], #0x1\n"
+ "ld1 { v20.b }[12], [x28], #0x1\n"
+ "ld1 { v19.b }[12], [x27], #0x1\n"
+ "ld1 { v18.b }[12], [x26], #0x1\n"
+ "ld1 { v17.b }[12], [x25], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v24.h }[4], [x13], #0x2\n"
+ "ld1 { v23.h }[4], [x10], #0x2\n"
+ "ld1 { v22.h }[4], [x9], #0x2\n"
+ "ld1 { v20.h }[4], [x28], #0x2\n"
+ "ld1 { v19.h }[4], [x27], #0x2\n"
+ "ld1 { v18.h }[4], [x26], #0x2\n"
+ "ld1 { v17.h }[4], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v24.b }[10], [x13], #0x1\n"
+ "ld1 { v23.b }[10], [x10], #0x1\n"
+ "ld1 { v22.b }[10], [x9], #0x1\n"
+ "ld1 { v20.b }[10], [x28], #0x1\n"
+ "ld1 { v19.b }[10], [x27], #0x1\n"
+ "ld1 { v18.b }[10], [x26], #0x1\n"
+ "ld1 { v17.b }[10], [x25], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v24.b }[8], [x13], #0x1\n"
+ "ld1 { v23.b }[8], [x10], #0x1\n"
+ "ld1 { v22.b }[8], [x9], #0x1\n"
+ "ld1 { v20.b }[8], [x28], #0x1\n"
+ "ld1 { v19.b }[8], [x27], #0x1\n"
+ "ld1 { v18.b }[8], [x26], #0x1\n"
+ "ld1 { v17.b }[8], [x25], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s24, [x13], #0x4\n"
+ "ldr s23, [x10], #0x4\n"
+ "ldr s22, [x9], #0x4\n"
+ "ldr s20, [x28], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "ldr s17, [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "ld1 { v23.h }[2], [x10], #0x2\n"
+ "ld1 { v22.h }[2], [x9], #0x2\n"
+ "ld1 { v20.h }[2], [x28], #0x2\n"
+ "ld1 { v19.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v17.h }[2], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v24.b }[6], [x13], #0x1\n"
+ "ld1 { v23.b }[6], [x10], #0x1\n"
+ "ld1 { v22.b }[6], [x9], #0x1\n"
+ "ld1 { v20.b }[6], [x28], #0x1\n"
+ "ld1 { v19.b }[6], [x27], #0x1\n"
+ "ld1 { v18.b }[6], [x26], #0x1\n"
+ "ld1 { v17.b }[6], [x25], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v24.b }[4], [x13], #0x1\n"
+ "ld1 { v23.b }[4], [x10], #0x1\n"
+ "ld1 { v22.b }[4], [x9], #0x1\n"
+ "ld1 { v20.b }[4], [x28], #0x1\n"
+ "ld1 { v19.b }[4], [x27], #0x1\n"
+ "ld1 { v18.b }[4], [x26], #0x1\n"
+ "ld1 { v17.b }[4], [x25], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h24, [x13], #0x2\n"
+ "ldr h23, [x10], #0x2\n"
+ "ldr h22, [x9], #0x2\n"
+ "ldr h20, [x28], #0x2\n"
+ "ldr h19, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h17, [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v24.b }[2], [x13], #0x1\n"
+ "ld1 { v23.b }[2], [x10], #0x1\n"
+ "ld1 { v22.b }[2], [x9], #0x1\n"
+ "ld1 { v20.b }[2], [x28], #0x1\n"
+ "ld1 { v19.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v17.b }[2], [x25], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b25, [x14], #0x1\n"
+ "ldr b24, [x13], #0x1\n"
+ "ldr b23, [x10], #0x1\n"
+ "ldr b22, [x9], #0x1\n"
+ "ldr b20, [x28], #0x1\n"
+ "ldr b19, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
"ldr q29, [%x[params], #0x10]\n"
- "zip1 v5.16b, v1.16b, v23.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "zip2 v3.16b, v1.16b, v23.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "zip1 v2.16b, v31.16b, v21.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "zip2 v4.16b, v31.16b, v21.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "ldr q28, [%x[params], #0x20]\n"
+ "zip2 v2.16b, v6.16b, v4.16b\n"
+ "zip1 v6.16b, v6.16b, v4.16b\n"
+ "ldr q27, [%x[params], #0x30]\n"
+ "zip1 v4.16b, v5.16b, v3.16b\n"
+ "zip2 v3.16b, v5.16b, v3.16b\n"
+ "cmp x20, #0x4\n"
+ "zip2 v5.16b, v6.16b, v4.16b\n"
+ "zip1 v6.16b, v6.16b, v4.16b\n"
+ "zip1 v4.16b, v2.16b, v3.16b\n"
+ "zip2 v3.16b, v2.16b, v3.16b\n"
+ "ldr q2, [%x[params], #0x0]\n"
+ "zip2 v26.16b, v1.16b, v31.16b\n"
+ "zip1 v1.16b, v1.16b, v31.16b\n"
+ "zip1 v31.16b, v0.16b, v30.16b\n"
+ "zip2 v30.16b, v0.16b, v30.16b\n"
+ "zip2 v21.16b, v25.16b, v23.16b\n"
+ "zip1 v25.16b, v25.16b, v23.16b\n"
+ "zip1 v23.16b, v24.16b, v22.16b\n"
+ "zip2 v22.16b, v24.16b, v22.16b\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v18.16b\n"
+ "zip1 v18.16b, v19.16b, v17.16b\n"
+ "zip2 v17.16b, v19.16b, v17.16b\n"
+ "zip2 v0.16b, v1.16b, v31.16b\n"
+ "zip1 v1.16b, v1.16b, v31.16b\n"
+ "zip1 v31.16b, v26.16b, v30.16b\n"
+ "zip2 v30.16b, v26.16b, v30.16b\n"
+ "zip2 v24.16b, v25.16b, v23.16b\n"
+ "zip1 v25.16b, v25.16b, v23.16b\n"
+ "zip1 v23.16b, v21.16b, v22.16b\n"
+ "zip2 v22.16b, v21.16b, v22.16b\n"
+ "zip2 v19.16b, v20.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v18.16b\n"
+ "zip1 v18.16b, v16.16b, v17.16b\n"
+ "zip2 v17.16b, v16.16b, v17.16b\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n"
+ ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n"
+ "ldr q1, [%x[params], #0x50]\n"
+ ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n"
+ ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- "zip1 v1.16b, v28.16b, v26.16b\n"
- "zip2 v31.16b, v28.16b, v26.16b\n"
- "zip1 v28.16b, v24.16b, v20.16b\n"
- "zip2 v26.16b, v24.16b, v20.16b\n"
- "zip1 v24.16b, v22.16b, v16.16b\n"
- "zip2 v22.16b, v22.16b, v16.16b\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "zip1 v18.16b, v0.16b, v17.16b\n"
- "zip2 v17.16b, v0.16b, v17.16b\n"
- "zip1 v8.16b, v6.16b, v5.16b\n"
- "zip2 v7.16b, v6.16b, v5.16b\n"
- "zip1 v6.16b, v9.16b, v3.16b\n"
- "str q6, [SP, #0x0]\n"
- "zip2 v5.16b, v9.16b, v3.16b\n"
- "str q5, [SP, #0x10]\n"
- "zip1 v3.16b, v2.16b, v1.16b\n"
- "zip2 v2.16b, v2.16b, v1.16b\n"
- "zip1 v1.16b, v4.16b, v31.16b\n"
- "str q1, [SP, #0x20]\n"
- "zip2 v16.16b, v4.16b, v31.16b\n"
- "str q16, [SP, #0x30]\n"
- "zip1 v31.16b, v28.16b, v24.16b\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "zip1 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x40]\n"
- "zip2 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x50]\n"
- "zip1 v26.16b, v20.16b, v18.16b\n"
- "zip2 v24.16b, v20.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x60]\n"
- "zip2 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x70]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 19f\n"
- "str s30, [x24, x11]\n"
- "str s22, [x23, x11]\n"
- "str s20, [x21, x11]\n"
- "str s19, [x20, x11]\n"
- "b 22f\n"
- "19:" // Oddments: Unroll 0: Oddment store
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "blt 20f\n"
+ "str s2, [x24, x11]\n"
+ "str s26, [x23, x11]\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
"add x24, x24, x11\n"
"add x23, x23, x11\n"
+ "add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "tbz x19, #1, 20f\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v22.h }[0], [x23], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
- "st1 { v19.h }[0], [x20], #0x2\n"
- "tbz x19, #0, 21f\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v22.b }[2], [x23], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
- "st1 { v19.b }[2], [x20], #0x1\n"
- "b 21f\n"
- "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "tbz x19, #0, 21f\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v22.b }[0], [x23], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
- "st1 { v19.b }[0], [x20], #0x1\n"
- "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End
-
- "22:" // Oddments: Unroll 0: After oddment store
+ "tbz x20, #1, 21f\n"
+ "st1 { v2.h }[0], [x24], #0x2\n"
+ "st1 { v26.h }[0], [x23], #0x2\n"
+ "st1 { v21.h }[0], [x22], #0x2\n"
+ "st1 { v16.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v2.b }[2], [x24], #0x1\n"
+ "st1 { v26.b }[2], [x23], #0x1\n"
+ "st1 { v21.b }[2], [x22], #0x1\n"
+ "st1 { v16.b }[2], [x21], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v2.b }[0], [x24], #0x1\n"
+ "st1 { v26.b }[0], [x23], #0x1\n"
+ "st1 { v21.b }[0], [x22], #0x1\n"
+ "st1 { v16.b }[0], [x21], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "ldr q30, [%x[params], #0x0]\n"
- "mov v22.16b, v30.16b\n"
+ "ble 35f\n"
+ "ldr q2, [%x[params], #0x0]\n"
"ldr q29, [%x[params], #0x10]\n"
- "cmp x19, #0x4\n"
- "mov v20.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "ldr q23, [%x[params], #0x40]\n"
- ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr q28, [%x[params], #0x20]\n"
+ "ldr q27, [%x[params], #0x30]\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "ldr q1, [%x[params], #0x50]\n"
+ ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
- ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n"
"ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 23f\n"
- "str s30, [x24, x11]\n"
- "str s22, [x23, x11]\n"
- "str s20, [x21, x11]\n"
- "str s19, [x20, x11]\n"
- "b 26f\n"
- "23:" // Oddments: Unroll 1: Oddment store
+ ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n"
+ ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "blt 24f\n"
+ "str s2, [x24, x11]\n"
+ "str s26, [x23, x11]\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
"add x24, x24, x11\n"
"add x23, x23, x11\n"
+ "add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "tbz x19, #1, 24f\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v22.h }[0], [x23], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
- "st1 { v19.h }[0], [x20], #0x2\n"
- "tbz x19, #0, 25f\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v22.b }[2], [x23], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
- "st1 { v19.b }[2], [x20], #0x1\n"
- "b 25f\n"
- "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "tbz x19, #0, 25f\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v22.b }[0], [x23], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
- "st1 { v19.b }[0], [x20], #0x1\n"
- "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End
-
- "26:" // Oddments: Unroll 1: After oddment store
+ "tbz x20, #1, 25f\n"
+ "st1 { v2.h }[0], [x24], #0x2\n"
+ "st1 { v26.h }[0], [x23], #0x2\n"
+ "st1 { v21.h }[0], [x22], #0x2\n"
+ "st1 { v16.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v2.b }[2], [x24], #0x1\n"
+ "st1 { v26.b }[2], [x23], #0x1\n"
+ "st1 { v21.b }[2], [x22], #0x1\n"
+ "st1 { v16.b }[2], [x21], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v2.b }[0], [x24], #0x1\n"
+ "st1 { v26.b }[0], [x23], #0x1\n"
+ "st1 { v21.b }[0], [x22], #0x1\n"
+ "st1 { v16.b }[0], [x21], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "ldr q8, [SP, #0x0]\n"
- "ldr q3, [SP, #0x20]\n"
- "cmp x19, #0x4\n"
- "ldr q31, [SP, #0x40]\n"
- "ldr q26, [SP, #0x60]\n"
- "ldr q30, [%x[params], #0x0]\n"
- "mov v22.16b, v30.16b\n"
+ "ble 35f\n"
+ "ldr q2, [%x[params], #0x0]\n"
"ldr q29, [%x[params], #0x10]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "ldr q23, [%x[params], #0x40]\n"
- ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
- "add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
- ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr q28, [%x[params], #0x20]\n"
+ "ldr q27, [%x[params], #0x30]\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "ldr q1, [%x[params], #0x50]\n"
+ ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n"
+ ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
"ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
- ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
- ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 27f\n"
- "str s30, [x24, x11]\n"
- "str s22, [x23, x11]\n"
- "str s20, [x21, x11]\n"
- "str s19, [x20, x11]\n"
- "b 30f\n"
- "27:" // Oddments: Unroll 2: Oddment store
+ ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n"
+ ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n"
+ ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n"
+ ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "blt 28f\n"
+ "str s2, [x24, x11]\n"
+ "str s26, [x23, x11]\n"
+ "str s21, [x22, x11]\n"
+ "str s16, [x21, x11]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
"add x24, x24, x11\n"
"add x23, x23, x11\n"
+ "add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "tbz x19, #1, 28f\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v22.h }[0], [x23], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
- "st1 { v19.h }[0], [x20], #0x2\n"
- "tbz x19, #0, 29f\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v22.b }[2], [x23], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
- "st1 { v19.b }[2], [x20], #0x1\n"
- "b 29f\n"
- "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "tbz x19, #0, 29f\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v22.b }[0], [x23], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
- "st1 { v19.b }[0], [x20], #0x1\n"
- "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "tbz x20, #1, 29f\n"
+ "st1 { v2.h }[0], [x24], #0x2\n"
+ "st1 { v26.h }[0], [x23], #0x2\n"
+ "st1 { v21.h }[0], [x22], #0x2\n"
+ "st1 { v16.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v2.b }[2], [x24], #0x1\n"
+ "st1 { v26.b }[2], [x23], #0x1\n"
+ "st1 { v21.b }[2], [x22], #0x1\n"
+ "st1 { v16.b }[2], [x21], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v2.b }[0], [x24], #0x1\n"
+ "st1 { v26.b }[0], [x23], #0x1\n"
+ "st1 { v21.b }[0], [x22], #0x1\n"
+ "st1 { v16.b }[0], [x21], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
- "30:" // Oddments: Unroll 2: After oddment store
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
"add x11, x11, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "ldr q7, [SP, #0x10]\n"
- "ldr q2, [SP, #0x30]\n"
- "ldr q28, [SP, #0x50]\n"
- "ldr q24, [SP, #0x70]\n"
- "ldr q30, [%x[params], #0x0]\n"
- "mov v22.16b, v30.16b\n"
+ "ble 35f\n"
+ "ldr q2, [%x[params], #0x0]\n"
"ldr q29, [%x[params], #0x10]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "ldr q23, [%x[params], #0x40]\n"
- ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "mov v26.16b, v2.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr q28, [%x[params], #0x20]\n"
+ "ldr q27, [%x[params], #0x30]\n"
+ "mov v16.16b, v2.16b\n"
+ ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "ldr q1, [%x[params], #0x50]\n"
+ ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n"
+ ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
- ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
- ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
- "and v16.16b, v30.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "and v18.16b, v20.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v18.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "31:" // Oddments: Unroll 3: Oddment store
+ ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n"
+ ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n"
+ ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n"
+ ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n"
+ ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "sqrdmulh v2.4s, v2.4s, v6.4s\n"
+ ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n"
+ ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n"
+ "and v29.16b, v2.16b, v1.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sqadd v2.4s, v2.4s, v29.4s\n"
+ "and v28.16b, v26.16b, v1.16b\n"
+ "and v27.16b, v21.16b, v1.16b\n"
+ "and v29.16b, v16.16b, v1.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "srshl v2.4s, v2.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v7.4s\n"
+ "add v21.4s, v21.4s, v7.4s\n"
+ "add v16.4s, v16.4s, v7.4s\n"
+ "smax v2.4s, v2.4s, v9.4s\n"
+ "smax v26.4s, v26.4s, v9.4s\n"
+ "smax v21.4s, v21.4s, v9.4s\n"
+ "smax v16.4s, v16.4s, v9.4s\n"
+ "smin v2.4s, v2.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v8.4s\n"
+ "smin v21.4s, v21.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v8.4s\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
"add x24, x24, x11\n"
"add x23, x23, x11\n"
+ "add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add x20, x20, x11\n"
- "tbz x19, #1, 32f\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v22.h }[0], [x23], #0x2\n"
- "st1 { v20.h }[0], [x21], #0x2\n"
- "st1 { v19.h }[0], [x20], #0x2\n"
- "tbz x19, #0, 33f\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v22.b }[2], [x23], #0x1\n"
- "st1 { v20.b }[2], [x21], #0x1\n"
- "st1 { v19.b }[2], [x20], #0x1\n"
- "b 33f\n"
- "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "tbz x19, #0, 33f\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v22.b }[0], [x23], #0x1\n"
- "st1 { v20.b }[0], [x21], #0x1\n"
- "st1 { v19.b }[0], [x20], #0x1\n"
- "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "tbz x20, #1, 33f\n"
+ "st1 { v2.h }[0], [x24], #0x2\n"
+ "st1 { v26.h }[0], [x23], #0x2\n"
+ "st1 { v21.h }[0], [x22], #0x2\n"
+ "st1 { v16.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v2.b }[2], [x24], #0x1\n"
+ "st1 { v26.b }[2], [x23], #0x1\n"
+ "st1 { v21.b }[2], [x22], #0x1\n"
+ "st1 { v16.b }[2], [x21], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v2.b }[0], [x24], #0x1\n"
+ "st1 { v26.b }[0], [x23], #0x1\n"
+ "st1 { v21.b }[0], [x22], #0x1\n"
+ "st1 { v16.b }[0], [x21], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
- "34:" // End
- "add SP, SP, #0x80\n"
+ "35:" // End
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 64b305c21d..986937f3b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,1282 +41,1622 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ldp x13, x12, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "ldp x11, x10, [%x[inptrs], #0x10]\n"
- "mov x19, #0x1\n"
- "ldp x9, x28, [%x[inptrs], #0x20]\n"
- "orr x19, x19, #0x100\n"
- "ldp x27, x26, [%x[inptrs], #0x30]\n"
- "orr x19, x19, #0x10000\n"
- "dup v11.4s, w19\n"
- "ldp x25, x24, [%x[outptrs], #0x0]\n"
- "mov x23, #0x0\n"
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v14.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "ldp x24, x23, [%x[outptrs], #0x0]\n"
"ldp x22, x21, [%x[outptrs], #0x10]\n"
- "lsr x20, %x[n_channels], #0x4\n"
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v9.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v14.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.4s }, [x19]\n"
- "cbz x20, 2f\n"
+ "cbz x11, 3f\n"
+ "ldr q9, [x15, x28]\n"
+ "ldr q8, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q7, [x13, x28]\n"
+ "ldr q6, [x12, x28]\n"
+ "zip2 v5.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "ldr q4, [x10, x28]\n"
+ "ldr q3, [x9, x28]\n"
+ "zip1 v7.16b, v8.16b, v6.16b\n"
+ "zip2 v6.16b, v8.16b, v6.16b\n"
+ "ldr q2, [x26, x28]\n"
+ "ldr q1, [x25, x28]\n"
+ "zip2 v8.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v5.16b, v6.16b\n"
+ "zip2 v6.16b, v5.16b, v6.16b\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldr q29, [x15, x28]\n"
+ "zip1 v2.16b, v3.16b, v1.16b\n"
+ "zip2 v1.16b, v3.16b, v1.16b\n"
+ "ldr q28, [x14, x28]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "zip2 v3.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "ldr q27, [x13, x28]\n"
+ "ldr q26, [x12, x28]\n"
+ "zip2 v25.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldr q24, [x10, x28]\n"
+ "zip1 v27.16b, v28.16b, v26.16b\n"
+ "zip2 v26.16b, v28.16b, v26.16b\n"
+ "ldr q23, [x9, x28]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "zip1 v2.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "ldr q22, [x26, x28]\n"
+ "ldr q21, [x25, x28]\n"
+ "zip2 v20.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v23.16b, v21.16b\n"
+ "zip2 v21.16b, v23.16b, v21.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip2 v28.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v25.16b, v26.16b\n"
+ "zip2 v26.16b, v25.16b, v26.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v23.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v20.16b, v21.16b\n"
+ "zip2 v21.16b, v20.16b, v21.16b\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ "beq 2f\n"
"1:" // Loop
- "movi v15.4s, #0x0\n"
- "ldr q27, [x13, x23]\n"
- "subs x20, x20, #0x1\n"
- "movi v10.4s, #0x0\n"
- "ldr q1, [x12, x23]\n"
- "ldp x13, x12, [%x[inptrs], #0x40]\n"
- "ldr q25, [x11, x23]\n"
- "zip1 v7.16b, v27.16b, v25.16b\n"
- "ldr q23, [x10, x23]\n"
- "zip2 v5.16b, v27.16b, v25.16b\n"
- "ldp x11, x10, [%x[inptrs], #0x50]\n"
- "ldr q31, [x9, x23]\n"
- "zip1 v8.16b, v1.16b, v23.16b\n"
- "ldr q28, [x28, x23]\n"
- "zip2 v3.16b, v1.16b, v23.16b\n"
- "ldp x9, x28, [%x[inptrs], #0x60]\n"
- "zip1 v6.16b, v7.16b, v8.16b\n"
- "ldr q21, [x27, x23]\n"
- "zip2 v8.16b, v7.16b, v8.16b\n"
- "ldr q26, [x26, x23]\n"
- "zip1 v7.16b, v5.16b, v3.16b\n"
- "ldp x27, x26, [%x[inptrs], #0x70]\n"
- "zip2 v5.16b, v5.16b, v3.16b\n"
- "ldr q24, [x13, x23]\n"
- "ldr q22, [x12, x23]\n"
- "zip1 v2.16b, v31.16b, v21.16b\n"
- "zip2 v4.16b, v31.16b, v21.16b\n"
- "ldp x13, x12, [%x[inptrs], #0x0]\n"
- "zip1 v1.16b, v28.16b, v26.16b\n"
- "ldr q20, [x11, x23]\n"
- "zip2 v31.16b, v28.16b, v26.16b\n"
- "ldr q16, [x10, x23]\n"
- "zip1 v3.16b, v2.16b, v1.16b\n"
- "ldp x11, x10, [%x[inptrs], #0x10]\n"
- "zip2 v2.16b, v2.16b, v1.16b\n"
- "ldr q19, [x9, x23]\n"
- "zip1 v1.16b, v4.16b, v31.16b\n"
- "ldr q0, [x28, x23]\n"
- "zip1 v28.16b, v24.16b, v20.16b\n"
- "ldp x9, x28, [%x[inptrs], #0x20]\n"
- "zip2 v26.16b, v24.16b, v20.16b\n"
- "ldr q18, [x27, x23]\n"
- "zip1 v24.16b, v22.16b, v16.16b\n"
- "ldr q17, [x26, x23]\n"
- "zip2 v22.16b, v22.16b, v16.16b\n"
- "ldp x27, x26, [%x[inptrs], #0x30]\n"
- "zip2 v16.16b, v4.16b, v31.16b\n"
- "str q7, [SP, #0x0]\n"
- "zip1 v31.16b, v28.16b, v24.16b\n"
- "str q5, [SP, #0x10]\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "str q1, [SP, #0x20]\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "str q16, [SP, #0x30]\n"
- "zip1 v18.16b, v0.16b, v17.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "zip2 v17.16b, v0.16b, v17.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "zip1 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x40]\n"
- "zip2 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x50]\n"
- "zip1 v26.16b, v20.16b, v18.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "zip2 v24.16b, v20.16b, v18.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "zip1 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x60]\n"
- "zip2 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x70]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
- "mov v20.16b, v30.16b\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
- ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
- ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
- ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
- ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
- ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
- "ldr q29, [%x[params], #0x70]\n"
- ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
- "ldr q3, [SP, #0x20]\n"
- ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
- "ldr q27, [%x[params], #0x80]\n"
- ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
- "ldr q31, [SP, #0x40]\n"
- ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
- "ldr q25, [%x[params], #0x90]\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
- "ldr q6, [SP, #0x0]\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "ldr q26, [SP, #0x60]\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "movi v15.4s, #0x0\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
+ ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
+ ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
+ ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
+ ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
+ ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
+ ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
+ ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
+ "ldr q9, [%x[params], #0x0]\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
+ ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x60]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0xa0]\n"
- ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x40]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x50]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x30]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "mov v17.16b, v15.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "ldr q30, [%x[params], #0x60]\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0xb0]\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
+ ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x20]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x22, x23]\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "mov v19.16b, v30.16b\n"
- "add x23, x23, #0x4\n"
- ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
"ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "movi v10.4s, #0x0\n"
- ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "mls v30.4s, v15.4s, v14.4s\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
+ ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
"ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
- "ldr q29, [%x[params], #0xd0]\n"
- ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
- "ldr q2, [SP, #0x30]\n"
- ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
- "ldr q27, [%x[params], #0xe0]\n"
- ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
- "ldr q28, [SP, #0x50]\n"
- ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
- "ldr q25, [%x[params], #0xf0]\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
- "ldr q8, [SP, #0x10]\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "ldr q24, [SP, #0x70]\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
- "movi v15.4s, #0x0\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x9, x28]\n"
+ ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
+ ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
+ ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
+ ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
+ "ldr q8, [x14, x28]\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0xc0]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
- "movi v10.4s, #0x0\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x100]\n"
- ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x90]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0xd0]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
+ ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x80]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
+ ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
+ ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
+ ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
+ ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
+ "ldr q2, [x26, x28]\n"
+ ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
+ ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
+ ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
+ ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
+ "ldr q7, [x13, x28]\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x120]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "mov v17.16b, v15.16b\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0xf0]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x130]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
+ ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0x110]\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "ldr q30, [%x[params], #0xc0]\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "str s20, [x22, x23]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "mov v19.16b, v30.16b\n"
- "add x23, x23, #0x4\n"
- ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
- ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
"ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
+ ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
+ ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
"ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
- "ldr q29, [%x[params], #0x130]\n"
- ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
- ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
- "ldr q27, [%x[params], #0x140]\n"
- ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
- ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
- "ldr q25, [%x[params], #0x150]\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v16.16b, v20.16b, v21.16b\n"
- "movi v15.4s, #0x0\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "movi v10.4s, #0x0\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x160]\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
+ ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
+ "ldr q1, [x25, x28]\n"
+ ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
+ ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
+ ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
+ ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
+ "ldr q6, [x12, x28]\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [x15, x28]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldr q29, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "mov v17.16b, v15.16b\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "ldr q27, [x13, x28]\n"
+ "ldr q26, [x12, x28]\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x170]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x150]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [x10, x28]\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldr q24, [x10, x28]\n"
+ "ldr q23, [x9, x28]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "ldr q22, [x26, x28]\n"
+ "ldr q21, [x25, x28]\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
"smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "ldr q21, [%x[params], #0x170]\n"
+ "str s5, [x24, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "zip2 v5.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v8.16b, v6.16b\n"
+ "zip2 v6.16b, v8.16b, v6.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "ldr q30, [%x[params], #0x120]\n"
+ "str s25, [x22, x27]\n"
+ "zip2 v8.16b, v9.16b, v7.16b\n"
+ "str s20, [x21, x27]\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v5.16b, v6.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v6.16b, v5.16b, v6.16b\n"
+ "ldr q5, [%x[params], #0x140]\n"
+ "zip2 v30.16b, v4.16b, v2.16b\n"
"add %x[params], %x[params], #0x180\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "zip1 v2.16b, v3.16b, v1.16b\n"
+ "zip2 v1.16b, v3.16b, v1.16b\n"
+ "zip2 v25.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v28.16b, v26.16b\n"
+ "zip2 v26.16b, v28.16b, v26.16b\n"
+ "zip2 v20.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v23.16b, v21.16b\n"
+ "zip2 v21.16b, v23.16b, v21.16b\n"
+ "zip2 v3.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "zip1 v2.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v25.16b, v26.16b\n"
+ "zip2 v26.16b, v25.16b, v26.16b\n"
+ "zip2 v23.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v20.16b, v21.16b\n"
+ "zip2 v21.16b, v20.16b, v21.16b\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
+ ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
+ ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
+ ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
+ ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
+ ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
+ ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
+ ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
+ "ldr q9, [%x[params], #0x0]\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
+ ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x60]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x40]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x50]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x30]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
+ ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x20]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x22, x23]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "mov v19.16b, v30.16b\n"
- "add x23, x23, #0x4\n"
- ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
"ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "mls v30.4s, v15.4s, v14.4s\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
+ ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
"ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
- ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
- ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v16.16b, v20.16b, v21.16b\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
+ ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
+ ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
+ ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
+ ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0xc0]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0x90]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0xd0]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
+ ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0x80]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
+ ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
+ ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
+ ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
+ ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
+ ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
+ ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
+ ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
+ ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "ldr q9, [%x[params], #0x120]\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "ldr q31, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "ldr q0, [%x[params], #0xf0]\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x130]\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
+ ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v22.4s, v22.4s, v9.4s\n"
+ "str s5, [x24, x27]\n"
+ "ldr q5, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
+ ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x22, x27]\n"
+ "mov v30.16b, v5.16b\n"
+ "str s20, [x21, x27]\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
+ ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
+ ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
+ ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
+ ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
+ ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
+ ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x25, x23]\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x23]\n"
- "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str s5, [x24, x27]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s30, [x23, x27]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x22, x23]\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x21, x23]\n"
- "add x23, x23, #0x4\n"
- "bgt 1b\n"
- "tst %x[n_channels], #0xf\n"
- "beq 34f\n"
- "2:" // Oddments
- "and x19, %x[n_channels], #0xf\n"
- "add x13, x13, x23\n"
- "add x12, x12, x23\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "add x9, x9, x23\n"
- "add x28, x28, x23\n"
- "add x27, x27, x23\n"
- "add x26, x26, x23\n"
- "tbz %x[n_channels], #3, 6f\n"
- "ld1 { v27.d }[0], [x13], #0x8\n"
- "ld1 { v1.d }[0], [x12], #0x8\n"
- "ld1 { v25.d }[0], [x11], #0x8\n"
- "ld1 { v23.d }[0], [x10], #0x8\n"
- "ld1 { v31.d }[0], [x9], #0x8\n"
- "ld1 { v28.d }[0], [x28], #0x8\n"
- "ld1 { v21.d }[0], [x27], #0x8\n"
- "ld1 { v26.d }[0], [x26], #0x8\n"
- "tbz %x[n_channels], #2, 4f\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d8, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d6, [x12], #0x8\n"
+ "ldr d4, [x10], #0x8\n"
+ "ldr d3, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v6.s }[2], [x12], #0x4\n"
+ "ld1 { v4.s }[2], [x10], #0x4\n"
+ "ld1 { v3.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.h }[6], [x15], #0x2\n"
+ "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v6.h }[6], [x12], #0x2\n"
+ "ld1 { v4.h }[6], [x10], #0x2\n"
+ "ld1 { v3.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[14], [x15], #0x1\n"
+ "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v6.b }[14], [x12], #0x1\n"
+ "ld1 { v4.b }[14], [x10], #0x1\n"
+ "ld1 { v3.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x25], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[12], [x15], #0x1\n"
+ "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v6.b }[12], [x12], #0x1\n"
+ "ld1 { v4.b }[12], [x10], #0x1\n"
+ "ld1 { v3.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x25], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.h }[4], [x15], #0x2\n"
+ "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v6.h }[4], [x12], #0x2\n"
+ "ld1 { v4.h }[4], [x10], #0x2\n"
+ "ld1 { v3.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[10], [x15], #0x1\n"
+ "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v6.b }[10], [x12], #0x1\n"
+ "ld1 { v4.b }[10], [x10], #0x1\n"
+ "ld1 { v3.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x25], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[8], [x15], #0x1\n"
+ "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v6.b }[8], [x12], #0x1\n"
+ "ld1 { v4.b }[8], [x10], #0x1\n"
+ "ld1 { v3.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x25], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s9, [x15], #0x4\n"
+ "ldr s8, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s6, [x12], #0x4\n"
+ "ldr s4, [x10], #0x4\n"
+ "ldr s3, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v9.h }[2], [x15], #0x2\n"
+ "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v6.h }[2], [x12], #0x2\n"
+ "ld1 { v4.h }[2], [x10], #0x2\n"
+ "ld1 { v3.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[6], [x15], #0x1\n"
+ "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v6.b }[6], [x12], #0x1\n"
+ "ld1 { v4.b }[6], [x10], #0x1\n"
+ "ld1 { v3.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x25], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[4], [x15], #0x1\n"
+ "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v6.b }[4], [x12], #0x1\n"
+ "ld1 { v4.b }[4], [x10], #0x1\n"
+ "ld1 { v3.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x25], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h9, [x15], #0x2\n"
+ "ldr h8, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h6, [x12], #0x2\n"
+ "ldr h4, [x10], #0x2\n"
+ "ldr h3, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.b }[2], [x15], #0x1\n"
+ "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v6.b }[2], [x12], #0x1\n"
+ "ld1 { v4.b }[2], [x10], #0x1\n"
+ "ld1 { v3.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x25], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b9, [x15], #0x1\n"
+ "ldr b8, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b6, [x12], #0x1\n"
+ "ldr b4, [x10], #0x1\n"
+ "ldr b3, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x25], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d29, [x15], #0x8\n"
+ "ldr d28, [x14], #0x8\n"
+ "ldr d27, [x13], #0x8\n"
+ "ldr d26, [x12], #0x8\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d22, [x26], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v29.s }[2], [x15], #0x4\n"
+ "ld1 { v28.s }[2], [x14], #0x4\n"
"ld1 { v27.s }[2], [x13], #0x4\n"
- "ld1 { v1.s }[2], [x12], #0x4\n"
- "ld1 { v25.s }[2], [x11], #0x4\n"
- "ld1 { v23.s }[2], [x10], #0x4\n"
- "ld1 { v31.s }[2], [x9], #0x4\n"
- "ld1 { v28.s }[2], [x28], #0x4\n"
- "ld1 { v21.s }[2], [x27], #0x4\n"
- "ld1 { v26.s }[2], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 3f\n"
+ "ld1 { v26.s }[2], [x12], #0x4\n"
+ "ld1 { v24.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v22.s }[2], [x26], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v29.h }[6], [x15], #0x2\n"
+ "ld1 { v28.h }[6], [x14], #0x2\n"
"ld1 { v27.h }[6], [x13], #0x2\n"
- "ld1 { v1.h }[6], [x12], #0x2\n"
- "ld1 { v25.h }[6], [x11], #0x2\n"
- "ld1 { v23.h }[6], [x10], #0x2\n"
- "ld1 { v31.h }[6], [x9], #0x2\n"
- "ld1 { v28.h }[6], [x28], #0x2\n"
- "ld1 { v21.h }[6], [x27], #0x2\n"
- "ld1 { v26.h }[6], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.h }[6], [x12], #0x2\n"
+ "ld1 { v24.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v22.h }[6], [x26], #0x2\n"
+ "ld1 { v21.h }[6], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[14], [x15], #0x1\n"
+ "ld1 { v28.b }[14], [x14], #0x1\n"
"ld1 { v27.b }[14], [x13], #0x1\n"
- "ld1 { v1.b }[14], [x12], #0x1\n"
- "ld1 { v25.b }[14], [x11], #0x1\n"
- "ld1 { v23.b }[14], [x10], #0x1\n"
- "ld1 { v31.b }[14], [x9], #0x1\n"
- "ld1 { v28.b }[14], [x28], #0x1\n"
- "ld1 { v21.b }[14], [x27], #0x1\n"
- "ld1 { v26.b }[14], [x26], #0x1\n"
- "b 10f\n"
- "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[14], [x12], #0x1\n"
+ "ld1 { v24.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v22.b }[14], [x26], #0x1\n"
+ "ld1 { v21.b }[14], [x25], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[12], [x15], #0x1\n"
+ "ld1 { v28.b }[12], [x14], #0x1\n"
"ld1 { v27.b }[12], [x13], #0x1\n"
- "ld1 { v1.b }[12], [x12], #0x1\n"
- "ld1 { v25.b }[12], [x11], #0x1\n"
- "ld1 { v23.b }[12], [x10], #0x1\n"
- "ld1 { v31.b }[12], [x9], #0x1\n"
- "ld1 { v28.b }[12], [x28], #0x1\n"
- "ld1 { v21.b }[12], [x27], #0x1\n"
- "ld1 { v26.b }[12], [x26], #0x1\n"
- "b 10f\n"
- "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset
- "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v26.b }[12], [x12], #0x1\n"
+ "ld1 { v24.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v22.b }[12], [x26], #0x1\n"
+ "ld1 { v21.b }[12], [x25], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v29.h }[4], [x15], #0x2\n"
+ "ld1 { v28.h }[4], [x14], #0x2\n"
"ld1 { v27.h }[4], [x13], #0x2\n"
- "ld1 { v1.h }[4], [x12], #0x2\n"
- "ld1 { v25.h }[4], [x11], #0x2\n"
- "ld1 { v23.h }[4], [x10], #0x2\n"
- "ld1 { v31.h }[4], [x9], #0x2\n"
- "ld1 { v28.h }[4], [x28], #0x2\n"
- "ld1 { v21.h }[4], [x27], #0x2\n"
- "ld1 { v26.h }[4], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.h }[4], [x12], #0x2\n"
+ "ld1 { v24.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v22.h }[4], [x26], #0x2\n"
+ "ld1 { v21.h }[4], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[10], [x15], #0x1\n"
+ "ld1 { v28.b }[10], [x14], #0x1\n"
"ld1 { v27.b }[10], [x13], #0x1\n"
- "ld1 { v1.b }[10], [x12], #0x1\n"
- "ld1 { v25.b }[10], [x11], #0x1\n"
- "ld1 { v23.b }[10], [x10], #0x1\n"
- "ld1 { v31.b }[10], [x9], #0x1\n"
- "ld1 { v28.b }[10], [x28], #0x1\n"
- "ld1 { v21.b }[10], [x27], #0x1\n"
- "ld1 { v26.b }[10], [x26], #0x1\n"
- "b 10f\n"
- "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[10], [x12], #0x1\n"
+ "ld1 { v24.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v22.b }[10], [x26], #0x1\n"
+ "ld1 { v21.b }[10], [x25], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[8], [x15], #0x1\n"
+ "ld1 { v28.b }[8], [x14], #0x1\n"
"ld1 { v27.b }[8], [x13], #0x1\n"
- "ld1 { v1.b }[8], [x12], #0x1\n"
- "ld1 { v25.b }[8], [x11], #0x1\n"
- "ld1 { v23.b }[8], [x10], #0x1\n"
- "ld1 { v31.b }[8], [x9], #0x1\n"
- "ld1 { v28.b }[8], [x28], #0x1\n"
- "ld1 { v21.b }[8], [x27], #0x1\n"
- "ld1 { v26.b }[8], [x26], #0x1\n"
- "b 10f\n"
- "6:" // Oddments: Load (A): Bit 3: Unset
- "tbz %x[n_channels], #2, 8f\n"
- "ld1 { v27.s }[0], [x13], #0x4\n"
- "ld1 { v1.s }[0], [x12], #0x4\n"
- "ld1 { v25.s }[0], [x11], #0x4\n"
- "ld1 { v23.s }[0], [x10], #0x4\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v28.s }[0], [x28], #0x4\n"
- "ld1 { v21.s }[0], [x27], #0x4\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v26.b }[8], [x12], #0x1\n"
+ "ld1 { v24.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v22.b }[8], [x26], #0x1\n"
+ "ld1 { v21.b }[8], [x25], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s29, [x15], #0x4\n"
+ "ldr s28, [x14], #0x4\n"
+ "ldr s27, [x13], #0x4\n"
+ "ldr s26, [x12], #0x4\n"
+ "ldr s24, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v29.h }[2], [x15], #0x2\n"
+ "ld1 { v28.h }[2], [x14], #0x2\n"
"ld1 { v27.h }[2], [x13], #0x2\n"
- "ld1 { v1.h }[2], [x12], #0x2\n"
- "ld1 { v25.h }[2], [x11], #0x2\n"
- "ld1 { v23.h }[2], [x10], #0x2\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v28.h }[2], [x28], #0x2\n"
- "ld1 { v21.h }[2], [x27], #0x2\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.h }[2], [x12], #0x2\n"
+ "ld1 { v24.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v22.h }[2], [x26], #0x2\n"
+ "ld1 { v21.h }[2], [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[6], [x15], #0x1\n"
+ "ld1 { v28.b }[6], [x14], #0x1\n"
"ld1 { v27.b }[6], [x13], #0x1\n"
- "ld1 { v1.b }[6], [x12], #0x1\n"
- "ld1 { v25.b }[6], [x11], #0x1\n"
- "ld1 { v23.b }[6], [x10], #0x1\n"
- "ld1 { v31.b }[6], [x9], #0x1\n"
- "ld1 { v28.b }[6], [x28], #0x1\n"
- "ld1 { v21.b }[6], [x27], #0x1\n"
- "ld1 { v26.b }[6], [x26], #0x1\n"
- "b 10f\n"
- "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[6], [x12], #0x1\n"
+ "ld1 { v24.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v22.b }[6], [x26], #0x1\n"
+ "ld1 { v21.b }[6], [x25], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[4], [x15], #0x1\n"
+ "ld1 { v28.b }[4], [x14], #0x1\n"
"ld1 { v27.b }[4], [x13], #0x1\n"
- "ld1 { v1.b }[4], [x12], #0x1\n"
- "ld1 { v25.b }[4], [x11], #0x1\n"
- "ld1 { v23.b }[4], [x10], #0x1\n"
- "ld1 { v31.b }[4], [x9], #0x1\n"
- "ld1 { v28.b }[4], [x28], #0x1\n"
- "ld1 { v21.b }[4], [x27], #0x1\n"
- "ld1 { v26.b }[4], [x26], #0x1\n"
- "b 10f\n"
- "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
- "tbz %x[n_channels], #1, 9f\n"
- "ld1 { v27.h }[0], [x13], #0x2\n"
- "ld1 { v1.h }[0], [x12], #0x2\n"
- "ld1 { v25.h }[0], [x11], #0x2\n"
- "ld1 { v23.h }[0], [x10], #0x2\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v28.h }[0], [x28], #0x2\n"
- "ld1 { v21.h }[0], [x27], #0x2\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v26.b }[4], [x12], #0x1\n"
+ "ld1 { v24.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v22.b }[4], [x26], #0x1\n"
+ "ld1 { v21.b }[4], [x25], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h29, [x15], #0x2\n"
+ "ldr h28, [x14], #0x2\n"
+ "ldr h27, [x13], #0x2\n"
+ "ldr h26, [x12], #0x2\n"
+ "ldr h24, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h22, [x26], #0x2\n"
+ "ldr h21, [x25], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v29.b }[2], [x15], #0x1\n"
+ "ld1 { v28.b }[2], [x14], #0x1\n"
"ld1 { v27.b }[2], [x13], #0x1\n"
- "ld1 { v1.b }[2], [x12], #0x1\n"
- "ld1 { v25.b }[2], [x11], #0x1\n"
- "ld1 { v23.b }[2], [x10], #0x1\n"
- "ld1 { v31.b }[2], [x9], #0x1\n"
- "ld1 { v28.b }[2], [x28], #0x1\n"
- "ld1 { v21.b }[2], [x27], #0x1\n"
- "ld1 { v26.b }[2], [x26], #0x1\n"
- "b 10f\n"
- "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 10f\n"
- "ld1 { v27.b }[0], [x13], #0x1\n"
- "ld1 { v1.b }[0], [x12], #0x1\n"
- "ld1 { v25.b }[0], [x11], #0x1\n"
- "ld1 { v23.b }[0], [x10], #0x1\n"
- "ld1 { v31.b }[0], [x9], #0x1\n"
- "ld1 { v28.b }[0], [x28], #0x1\n"
- "ld1 { v21.b }[0], [x27], #0x1\n"
- "ld1 { v26.b }[0], [x26], #0x1\n"
- "10:" // Oddments: Load (A): Bit 3: End
- "ldp x13, x12, [%x[inptrs], #0x40]\n"
- "add x13, x13, x23\n"
- "ldp x11, x10, [%x[inptrs], #0x50]\n"
- "ldp x9, x28, [%x[inptrs], #0x60]\n"
- "add x12, x12, x23\n"
- "ldp x27, x26, [%x[inptrs], #0x70]\n"
- "add x11, x11, x23\n"
- "add x10, x10, x23\n"
- "add x9, x9, x23\n"
- "add x28, x28, x23\n"
- "add x27, x27, x23\n"
- "add x26, x26, x23\n"
- "tbz %x[n_channels], #3, 14f\n"
- "ld1 { v24.d }[0], [x13], #0x8\n"
- "ld1 { v22.d }[0], [x12], #0x8\n"
- "ld1 { v20.d }[0], [x11], #0x8\n"
- "ld1 { v16.d }[0], [x10], #0x8\n"
- "ld1 { v19.d }[0], [x9], #0x8\n"
- "ld1 { v0.d }[0], [x28], #0x8\n"
- "ld1 { v18.d }[0], [x27], #0x8\n"
- "ld1 { v17.d }[0], [x26], #0x8\n"
- "tbz %x[n_channels], #2, 12f\n"
- "ld1 { v24.s }[2], [x13], #0x4\n"
- "ld1 { v22.s }[2], [x12], #0x4\n"
- "ld1 { v20.s }[2], [x11], #0x4\n"
- "ld1 { v16.s }[2], [x10], #0x4\n"
- "ld1 { v19.s }[2], [x9], #0x4\n"
- "ld1 { v0.s }[2], [x28], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "ld1 { v17.s }[2], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 11f\n"
- "ld1 { v24.h }[6], [x13], #0x2\n"
- "ld1 { v22.h }[6], [x12], #0x2\n"
- "ld1 { v20.h }[6], [x11], #0x2\n"
- "ld1 { v16.h }[6], [x10], #0x2\n"
- "ld1 { v19.h }[6], [x9], #0x2\n"
- "ld1 { v0.h }[6], [x28], #0x2\n"
- "ld1 { v18.h }[6], [x27], #0x2\n"
- "ld1 { v17.h }[6], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[14], [x13], #0x1\n"
- "ld1 { v22.b }[14], [x12], #0x1\n"
- "ld1 { v20.b }[14], [x11], #0x1\n"
- "ld1 { v16.b }[14], [x10], #0x1\n"
- "ld1 { v19.b }[14], [x9], #0x1\n"
- "ld1 { v0.b }[14], [x28], #0x1\n"
- "ld1 { v18.b }[14], [x27], #0x1\n"
- "ld1 { v17.b }[14], [x26], #0x1\n"
- "b 18f\n"
- "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[12], [x13], #0x1\n"
- "ld1 { v22.b }[12], [x12], #0x1\n"
- "ld1 { v20.b }[12], [x11], #0x1\n"
- "ld1 { v16.b }[12], [x10], #0x1\n"
- "ld1 { v19.b }[12], [x9], #0x1\n"
- "ld1 { v0.b }[12], [x28], #0x1\n"
- "ld1 { v18.b }[12], [x27], #0x1\n"
- "ld1 { v17.b }[12], [x26], #0x1\n"
- "b 18f\n"
- "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset
- "tbz %x[n_channels], #1, 13f\n"
- "ld1 { v24.h }[4], [x13], #0x2\n"
- "ld1 { v22.h }[4], [x12], #0x2\n"
- "ld1 { v20.h }[4], [x11], #0x2\n"
- "ld1 { v16.h }[4], [x10], #0x2\n"
- "ld1 { v19.h }[4], [x9], #0x2\n"
- "ld1 { v0.h }[4], [x28], #0x2\n"
- "ld1 { v18.h }[4], [x27], #0x2\n"
- "ld1 { v17.h }[4], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[10], [x13], #0x1\n"
- "ld1 { v22.b }[10], [x12], #0x1\n"
- "ld1 { v20.b }[10], [x11], #0x1\n"
- "ld1 { v16.b }[10], [x10], #0x1\n"
- "ld1 { v19.b }[10], [x9], #0x1\n"
- "ld1 { v0.b }[10], [x28], #0x1\n"
- "ld1 { v18.b }[10], [x27], #0x1\n"
- "ld1 { v17.b }[10], [x26], #0x1\n"
- "b 18f\n"
- "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[8], [x13], #0x1\n"
- "ld1 { v22.b }[8], [x12], #0x1\n"
- "ld1 { v20.b }[8], [x11], #0x1\n"
- "ld1 { v16.b }[8], [x10], #0x1\n"
- "ld1 { v19.b }[8], [x9], #0x1\n"
- "ld1 { v0.b }[8], [x28], #0x1\n"
- "ld1 { v18.b }[8], [x27], #0x1\n"
- "ld1 { v17.b }[8], [x26], #0x1\n"
- "b 18f\n"
- "14:" // Oddments: Load (B): Bit 3: Unset
- "tbz %x[n_channels], #2, 16f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "ld1 { v22.s }[0], [x12], #0x4\n"
- "ld1 { v20.s }[0], [x11], #0x4\n"
- "ld1 { v16.s }[0], [x10], #0x4\n"
- "ld1 { v19.s }[0], [x9], #0x4\n"
- "ld1 { v0.s }[0], [x28], #0x4\n"
- "ld1 { v18.s }[0], [x27], #0x4\n"
- "ld1 { v17.s }[0], [x26], #0x4\n"
- "tbz %x[n_channels], #1, 15f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "ld1 { v22.h }[2], [x12], #0x2\n"
- "ld1 { v20.h }[2], [x11], #0x2\n"
- "ld1 { v16.h }[2], [x10], #0x2\n"
- "ld1 { v19.h }[2], [x9], #0x2\n"
- "ld1 { v0.h }[2], [x28], #0x2\n"
- "ld1 { v18.h }[2], [x27], #0x2\n"
- "ld1 { v17.h }[2], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[6], [x13], #0x1\n"
- "ld1 { v22.b }[6], [x12], #0x1\n"
- "ld1 { v20.b }[6], [x11], #0x1\n"
- "ld1 { v16.b }[6], [x10], #0x1\n"
- "ld1 { v19.b }[6], [x9], #0x1\n"
- "ld1 { v0.b }[6], [x28], #0x1\n"
- "ld1 { v18.b }[6], [x27], #0x1\n"
- "ld1 { v17.b }[6], [x26], #0x1\n"
- "b 18f\n"
- "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[4], [x13], #0x1\n"
- "ld1 { v22.b }[4], [x12], #0x1\n"
- "ld1 { v20.b }[4], [x11], #0x1\n"
- "ld1 { v16.b }[4], [x10], #0x1\n"
- "ld1 { v19.b }[4], [x9], #0x1\n"
- "ld1 { v0.b }[4], [x28], #0x1\n"
- "ld1 { v18.b }[4], [x27], #0x1\n"
- "ld1 { v17.b }[4], [x26], #0x1\n"
- "b 18f\n"
- "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
- "tbz %x[n_channels], #1, 17f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "ld1 { v22.h }[0], [x12], #0x2\n"
- "ld1 { v20.h }[0], [x11], #0x2\n"
- "ld1 { v16.h }[0], [x10], #0x2\n"
- "ld1 { v19.h }[0], [x9], #0x2\n"
- "ld1 { v0.h }[0], [x28], #0x2\n"
- "ld1 { v18.h }[0], [x27], #0x2\n"
- "ld1 { v17.h }[0], [x26], #0x2\n"
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[2], [x13], #0x1\n"
- "ld1 { v22.b }[2], [x12], #0x1\n"
- "ld1 { v20.b }[2], [x11], #0x1\n"
- "ld1 { v16.b }[2], [x10], #0x1\n"
- "ld1 { v19.b }[2], [x9], #0x1\n"
- "ld1 { v0.b }[2], [x28], #0x1\n"
- "ld1 { v18.b }[2], [x27], #0x1\n"
- "ld1 { v17.b }[2], [x26], #0x1\n"
- "b 18f\n"
- "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "tbz %x[n_channels], #0, 18f\n"
- "ld1 { v24.b }[0], [x13], #0x1\n"
- "ld1 { v22.b }[0], [x12], #0x1\n"
- "ld1 { v20.b }[0], [x11], #0x1\n"
- "ld1 { v16.b }[0], [x10], #0x1\n"
- "ld1 { v19.b }[0], [x9], #0x1\n"
- "ld1 { v0.b }[0], [x28], #0x1\n"
- "ld1 { v18.b }[0], [x27], #0x1\n"
- "ld1 { v17.b }[0], [x26], #0x1\n"
- "18:" // Oddments: Load (B): Bit 3: End
- "zip1 v7.16b, v27.16b, v25.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "cmp x19, #0x4\n"
- "zip2 v5.16b, v27.16b, v25.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "zip1 v8.16b, v1.16b, v23.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "zip2 v3.16b, v1.16b, v23.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "zip1 v2.16b, v31.16b, v21.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "zip2 v4.16b, v31.16b, v21.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "ld1 { v26.b }[2], [x12], #0x1\n"
+ "ld1 { v24.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v22.b }[2], [x26], #0x1\n"
+ "ld1 { v21.b }[2], [x25], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b29, [x15], #0x1\n"
+ "ldr b28, [x14], #0x1\n"
+ "ldr b27, [x13], #0x1\n"
+ "ldr b26, [x12], #0x1\n"
+ "ldr b24, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b22, [x26], #0x1\n"
+ "ldr b21, [x25], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q0, [%x[params], #0x10]\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "zip2 v30.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "zip1 v2.16b, v3.16b, v1.16b\n"
+ "zip2 v5.16b, v9.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v8.16b, v6.16b\n"
+ "zip2 v6.16b, v8.16b, v6.16b\n"
+ "zip2 v1.16b, v3.16b, v1.16b\n"
+ "zip2 v3.16b, v4.16b, v2.16b\n"
+ "zip1 v4.16b, v4.16b, v2.16b\n"
+ "zip2 v25.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v27.16b, v28.16b, v26.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n"
+ "zip2 v8.16b, v9.16b, v7.16b\n"
+ "zip1 v9.16b, v9.16b, v7.16b\n"
+ "zip1 v7.16b, v5.16b, v6.16b\n"
+ "zip2 v6.16b, v5.16b, v6.16b\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "zip2 v26.16b, v28.16b, v26.16b\n"
+ "zip2 v20.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v23.16b, v21.16b\n"
+ "zip2 v21.16b, v23.16b, v21.16b\n"
+ "zip2 v28.16b, v29.16b, v27.16b\n"
+ "zip1 v29.16b, v29.16b, v27.16b\n"
+ "zip1 v2.16b, v30.16b, v1.16b\n"
+ ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip1 v27.16b, v25.16b, v26.16b\n"
+ "zip2 v26.16b, v25.16b, v26.16b\n"
+ "zip2 v23.16b, v24.16b, v22.16b\n"
+ "zip1 v24.16b, v24.16b, v22.16b\n"
+ "zip1 v22.16b, v20.16b, v21.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n"
+ "zip2 v21.16b, v20.16b, v21.16b\n"
+ "mov v30.16b, v5.16b\n"
+ ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n"
+ ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n"
+ ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n"
+ ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
"add %x[params], %x[params], #0x60\n"
- "zip1 v1.16b, v28.16b, v26.16b\n"
- "zip2 v31.16b, v28.16b, v26.16b\n"
- "zip1 v28.16b, v24.16b, v20.16b\n"
- "zip2 v26.16b, v24.16b, v20.16b\n"
- "zip1 v24.16b, v22.16b, v16.16b\n"
- "zip2 v22.16b, v22.16b, v16.16b\n"
- "zip1 v20.16b, v19.16b, v18.16b\n"
- "zip2 v19.16b, v19.16b, v18.16b\n"
- "zip1 v18.16b, v0.16b, v17.16b\n"
- "zip2 v17.16b, v0.16b, v17.16b\n"
- "zip1 v6.16b, v7.16b, v8.16b\n"
- "zip2 v8.16b, v7.16b, v8.16b\n"
- "zip1 v7.16b, v5.16b, v3.16b\n"
- "str q7, [SP, #0x0]\n"
- "zip2 v5.16b, v5.16b, v3.16b\n"
- "str q5, [SP, #0x10]\n"
- "zip1 v3.16b, v2.16b, v1.16b\n"
- "zip2 v2.16b, v2.16b, v1.16b\n"
- "zip1 v1.16b, v4.16b, v31.16b\n"
- "str q1, [SP, #0x20]\n"
- "zip2 v16.16b, v4.16b, v31.16b\n"
- "str q16, [SP, #0x30]\n"
- "zip1 v31.16b, v28.16b, v24.16b\n"
- "zip2 v28.16b, v28.16b, v24.16b\n"
- "zip1 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x40]\n"
- "zip2 v16.16b, v26.16b, v22.16b\n"
- "str q16, [SP, #0x50]\n"
- "zip1 v26.16b, v20.16b, v18.16b\n"
- "zip2 v24.16b, v20.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x60]\n"
- "zip2 v16.16b, v19.16b, v17.16b\n"
- "str q16, [SP, #0x70]\n"
- "mov v22.16b, v30.16b\n"
- "mov v20.16b, v30.16b\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
- ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
- "movi v15.4s, #0x0\n"
- ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
- ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
- ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
- ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
- "movi v10.4s, #0x0\n"
- ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
- ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
- ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
- ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
- ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v22.4s, v22.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 19f\n"
- "str s30, [x25, x23]\n"
- "str s22, [x24, x23]\n"
- "str s20, [x22, x23]\n"
- "str s19, [x21, x23]\n"
+ "blt 20f\n"
+ "str s5, [x24, x27]\n"
+ "str s30, [x23, x27]\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
"b 22f\n"
- "19:" // Oddments: Unroll 0: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 20f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 21f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 21f\n"
- "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "tbz x19, #0, 21f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End
-
- "22:" // Oddments: Unroll 0: After oddment store
- "add x23, x23, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "movi v15.4s, #0x0\n"
- "ldr q30, [%x[params], #0x0]\n"
- ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "cmp x19, #0x4\n"
- "movi v10.4s, #0x0\n"
- "ldr q27, [%x[params], #0x20]\n"
- "ldr q25, [%x[params], #0x30]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n"
+ ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n"
+ "movi v17.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- "mov v19.16b, v30.16b\n"
- ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
- ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
- ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
- ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
- ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n"
"ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n"
+ ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n"
+ ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n"
"ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
- ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
- ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
- ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n"
+ ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n"
+ ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n"
+ ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n"
+ ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 23f\n"
- "str s30, [x25, x23]\n"
- "str s22, [x24, x23]\n"
- "str s20, [x22, x23]\n"
- "str s19, [x21, x23]\n"
+ "blt 24f\n"
+ "str s5, [x24, x27]\n"
+ "str s30, [x23, x27]\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
"b 26f\n"
- "23:" // Oddments: Unroll 1: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 24f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 25f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 25f\n"
- "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "tbz x19, #0, 25f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End
-
- "26:" // Oddments: Unroll 1: After oddment store
- "add x23, x23, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "movi v15.4s, #0x0\n"
- "ldr q6, [SP, #0x0]\n"
- "movi v10.4s, #0x0\n"
- "ldr q3, [SP, #0x20]\n"
- "cmp x19, #0x4\n"
- ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
- "ldr q31, [SP, #0x40]\n"
- "ldr q26, [SP, #0x60]\n"
- ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
- "ldr q30, [%x[params], #0x0]\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n"
+ ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n"
+ "movi v17.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
- ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
- "ext v6.16b, v6.16b, v6.16b, #0x1\n"
- ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v31.16b, v31.16b, v31.16b, #0x1\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
- ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
- ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
- ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
- ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
- ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n"
+ ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n"
+ ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n"
+ ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n"
+ ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n"
+ ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n"
+ ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n"
+ ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "blt 27f\n"
- "str s30, [x25, x23]\n"
- "str s22, [x24, x23]\n"
- "str s20, [x22, x23]\n"
- "str s19, [x21, x23]\n"
+ "blt 28f\n"
+ "str s5, [x24, x27]\n"
+ "str s30, [x23, x27]\n"
+ "str s25, [x22, x27]\n"
+ "str s20, [x21, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
"b 30f\n"
- "27:" // Oddments: Unroll 2: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 28f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 29f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 29f\n"
- "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "tbz x19, #0, 29f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End
-
- "30:" // Oddments: Unroll 2: After oddment store
- "add x23, x23, #0x4\n"
- "subs x19, x19, #0x4\n"
- "ble 34f\n"
- "movi v15.4s, #0x0\n"
- "ldr q8, [SP, #0x10]\n"
- "movi v10.4s, #0x0\n"
- "ldr q2, [SP, #0x30]\n"
- "ldr q28, [SP, #0x50]\n"
- ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
- "ldr q24, [SP, #0x70]\n"
- "ldr q30, [%x[params], #0x0]\n"
- "mov v22.16b, v30.16b\n"
- "ldr q29, [%x[params], #0x10]\n"
- "mov v20.16b, v30.16b\n"
- "ldr q27, [%x[params], #0x20]\n"
- "mov v19.16b, v30.16b\n"
- "ldr q25, [%x[params], #0x30]\n"
- ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
- "ldr q23, [%x[params], #0x40]\n"
- "ldr q21, [%x[params], #0x50]\n"
- ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q5, [%x[params], #0x0]\n"
+ "ldr q0, [%x[params], #0x10]\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n"
+ "ldr q16, [%x[params], #0x20]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "mov v30.16b, v5.16b\n"
+ "mov v25.16b, v5.16b\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ "ldr q4, [%x[params], #0x50]\n"
+ "mov v20.16b, v5.16b\n"
+ ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n"
+ ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n"
+ "movi v17.4s, #0x0\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
- "mov v17.16b, v15.16b\n"
- ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
- ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
- "mls v30.4s, v15.4s, v14.4s\n"
- ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- "mls v20.4s, v17.4s, v14.4s\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
- ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
- ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
- "sqrdmulh v30.4s, v30.4s, v23.4s\n"
- ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
- ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
- ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
- "and v18.16b, v30.16b, v21.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
- ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
- "mov v17.16b, v10.16b\n"
- ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
- "mls v22.4s, v10.4s, v14.4s\n"
- ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "mls v19.4s, v17.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v23.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "and v16.16b, v20.16b, v21.16b\n"
+ ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n"
+ "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n"
+ ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n"
+ ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n"
+ ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n"
+ ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n"
+ "mls v5.4s, v19.4s, v11.4s\n"
+ ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n"
+ ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n"
+ ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n"
+ ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n"
+ "mls v30.4s, v17.4s, v11.4s\n"
+ "mls v25.4s, v18.4s, v11.4s\n"
+ "mls v20.4s, v16.4s, v11.4s\n"
+ "and v0.16b, v5.16b, v4.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "and v16.16b, v30.16b, v4.16b\n"
+ "and v31.16b, v25.16b, v4.16b\n"
+ "and v0.16b, v20.16b, v4.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v30.4s, v30.4s, v9.4s\n"
- "and v17.16b, v22.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "srshl v5.4s, v5.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "smax v5.4s, v5.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
"smin v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v19.4s, v19.4s, v23.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "and v16.16b, v19.16b, v21.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v21.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "srshl v22.4s, v22.4s, v21.4s\n"
- "add v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v9.4s\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
"smin v20.4s, v20.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v9.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "smax v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "31:" // Oddments: Unroll 3: Oddment store
- "add x25, x25, x23\n"
- "add x24, x24, x23\n"
- "add x22, x22, x23\n"
- "add x21, x21, x23\n"
- "tbz x19, #1, 32f\n"
- "st1 { v30.h }[0], [x25], #0x2\n"
- "st1 { v22.h }[0], [x24], #0x2\n"
- "st1 { v20.h }[0], [x22], #0x2\n"
- "st1 { v19.h }[0], [x21], #0x2\n"
- "tbz x19, #0, 33f\n"
- "st1 { v30.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v20.b }[2], [x22], #0x1\n"
- "st1 { v19.b }[2], [x21], #0x1\n"
- "b 33f\n"
- "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "tbz x19, #0, 33f\n"
- "st1 { v30.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v20.b }[0], [x22], #0x1\n"
- "st1 { v19.b }[0], [x21], #0x1\n"
- "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End
-
- "34:" // End
- "add SP, SP, #0x80\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v5.h }[0], [x24], #0x2\n"
+ "st1 { v30.h }[0], [x23], #0x2\n"
+ "st1 { v25.h }[0], [x22], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v5.b }[2], [x24], #0x1\n"
+ "st1 { v30.b }[2], [x23], #0x1\n"
+ "st1 { v25.b }[2], [x22], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v5.b }[0], [x24], #0x1\n"
+ "st1 { v30.b }[0], [x23], #0x1\n"
+ "st1 { v25.b }[0], [x22], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4b0fca77f1..15bbb31413 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,1072 +91,1072 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x7, x6, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v24.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v22.16b }, [x24]\n"
- "ld1r { v12.16b }, [x23]\n"
- "lsr x16, x8, #0x3\n"
- "ld1r { v14.8h }, [x21]\n"
- "ld1r { v17.8h }, [x20]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "ld1r { v15.8h }, [x19]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x22, #0x0]\n"
- "ldp x28, x27, [x22, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v19.16b, v13.16b\n"
- "ldr q26, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v11.16b, v26.16b\n"
- "mov v18.16b, v13.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v24.16b, v26.16b\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.8h }, [x21]\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "mov x8, #0x0\n"
+ "mov x17, #0x0\n"
+ "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x12, x11, [x22, #0x0]\n"
+ "ldp x10, x9, [x22, #0x10]\n"
+ "cbz x7, 3f\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
+ "subs x7, x7, #0x1\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "usubl v5.8h, v5.8b, v15.8b\n"
+ "usubl v6.8h, v6.8b, v15.8b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v8.8h, v8.8b, v15.8b\n"
+ "ldr q13, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "str x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
"mov v9.16b, v13.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v26.16b\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "ldr x19, [x12, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d31, [x24, x8]\n"
+ "ldr d30, [x23, x8]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d29, [x22, x8]\n"
+ "ldr d28, [x21, x8]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ldr d27, [x20, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "usubl v27.8h, v27.8b, v24.8b\n"
"beq 2f\n"
"1:" // Loop
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q22, [x13, #0x0]\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr x25, [x12, #0x40]\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr q23, [x14, #0x10]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr x21, [x16, #0x28]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr x19, [x12, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x23, [x12, #0x58]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x10]\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "ldr d29, [x20, x8]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "ldr x21, [x12, #0x68]\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "ldr x19, [x12, #0x78]\n"
- "ldr q21, [x13, #0x0]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x28, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "ldr q25, [x11, #0x0]\n"
- "ldr q10, [x13, #0x10]\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "ldr q16, [x11, #0x10]\n"
- "add x17, x17, #0x48\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "subs x16, x16, #0x1\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "add x15, x15, #0x48\n"
+ "subs x7, x7, #0x1\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x27, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "add x14, x14, #0x20\n"
"add x13, x13, #0x20\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "add x11, x11, #0x20\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x26, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "ldr d28, [x24, x8]\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x8]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x23, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x8]\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x20, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "add x8, x8, #0x8\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d13, [x10, x14]\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d13, [x12, x17]\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d19, [x9, x14]\n"
- "str d18, [x28, x14]\n"
- "str d9, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q26, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v19.16b, v13.16b\n"
- "mov v11.16b, v26.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v18.16b, v13.16b\n"
- "mov v24.16b, v26.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d9, [x11, x17]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str d16, [x10, x17]\n"
+ "str d25, [x9, x17]\n"
+ "ldr q13, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
+ "add x17, x17, #0x8\n"
+ "str x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
"mov v9.16b, v13.16b\n"
- "mov v23.16b, v26.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr x19, [x12, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "ldr d31, [x24, x8]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d30, [x23, x8]\n"
+ "ldr d29, [x22, x8]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v5.8h, v5.8b, v15.8b\n"
+ "ldr d28, [x21, x8]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "usubl v6.8h, v6.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d27, [x20, x8]\n"
+ "usubl v8.8h, v8.8b, v15.8b\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "usubl v27.8h, v27.8b, v24.8b\n"
"bgt 1b\n"
"2:" // Tail
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q22, [x13, #0x0]\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr x25, [x12, #0x40]\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr q23, [x14, #0x10]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr x21, [x16, #0x28]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr x19, [x12, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x23, [x12, #0x58]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x10]\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "ldr d29, [x20, x8]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "ldr x21, [x12, #0x68]\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "ldr x19, [x12, #0x78]\n"
- "ldr q21, [x13, #0x0]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x28, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "ldr q25, [x11, #0x0]\n"
- "ldr q10, [x13, #0x10]\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "ldr q16, [x11, #0x10]\n"
- "tst x8, #0x7\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "tst x6, #0x7\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "add x14, x14, #0x20\n"
"add x13, x13, #0x20\n"
- "add x11, x11, #0x20\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x27, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x26, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "ldr d28, [x24, x8]\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x8]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x23, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x8]\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x20, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "add x8, x8, #0x8\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d13, [x10, x14]\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d13, [x12, x17]\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d19, [x9, x14]\n"
- "str d18, [x28, x14]\n"
- "str d9, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d9, [x11, x17]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str d16, [x10, x17]\n"
+ "str d25, [x9, x17]\n"
+ "add x17, x17, #0x8\n"
"beq 64f\n"
- "add x17, x17, #0x48\n"
+ "add x15, x15, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v13.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v26.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v26.s }[2], [x19]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x6, #2, 5f\n"
+ "ld1 { v13.4s }, [x28], #0x10\n"
+ "tbz x6, #1, 4f\n"
+ "ld1 { v20.d }[0], [x28], #0x8\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v20.s }[2], [x28]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v26.s }[0], [x19]\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v20.s }[0], [x28]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "tbz x6, #1, 6f\n"
+ "ld1 { v13.d }[0], [x28], #0x8\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v13.s }[2], [x28]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v13.s }[0], [x19]\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v13.s }[0], [x28]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "mov v19.16b, v13.16b\n"
- "mov v11.16b, v26.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v18.16b, v13.16b\n"
- "mov v24.16b, v26.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
"mov v9.16b, v13.16b\n"
- "mov v23.16b, v26.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ldr x19, [x12, #0x20]\n"
- "usubl v4.8h, v4.8b, v12.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
- "usubl v6.8h, v6.8b, v12.8b\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
- "usubl v8.8h, v8.8b, v12.8b\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x23]\n"
- "ld1 { v30.b }[6], [x22]\n"
- "ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v5.8h, v5.8b, v15.8b\n"
+ "usubl v6.8h, v6.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v8.8h, v8.8b, v15.8b\n"
+ "add x24, x24, x8\n"
+ "add x23, x23, x8\n"
+ "add x22, x22, x8\n"
+ "add x21, x21, x8\n"
+ "add x20, x20, x8\n"
+ "tbz x6, #2, 9f\n"
+ "ld1 { v31.s }[0], [x24], #0x4\n"
+ "ld1 { v30.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x6, #1, 8f\n"
+ "ld1 { v31.h }[2], [x24], #0x2\n"
+ "ld1 { v30.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[6], [x24]\n"
+ "ld1 { v30.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x23]\n"
- "ld1 { v30.b }[4], [x22]\n"
- "ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[4], [x24]\n"
+ "ld1 { v30.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x23]\n"
- "ld1 { v30.b }[2], [x22]\n"
- "ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x6, #1, 10f\n"
+ "ld1 { v31.h }[0], [x24], #0x2\n"
+ "ld1 { v30.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[2], [x24]\n"
+ "ld1 { v30.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x23]\n"
- "ld1 { v30.b }[0], [x22]\n"
- "ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[0], [x24]\n"
+ "ld1 { v30.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "add x21, x21, x15\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "add x21, x21, x8\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "usubl v27.8h, v27.8b, v24.8b\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "tbz x8, #2, 13f\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "tbz x6, #2, 13f\n"
"ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 12f\n"
+ "tbz x6, #1, 12f\n"
"ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[6], [x21]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[4], [x21]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 14f\n"
+ "tbz x6, #1, 14f\n"
"ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[2], [x21]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[0], [x21]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr x20, [x12, #0x30]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x16, #0x30]\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "tbz x8, #2, 17f\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x8\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "tbz x6, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 16f\n"
+ "tbz x6, #1, 16f\n"
"ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 18f\n"
+ "tbz x6, #1, 18f\n"
"ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "add x26, x26, x15\n"
- "tbz x8, #2, 21f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x8, #1, 20f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "add x28, x28, x8\n"
+ "tbz x6, #2, 21f\n"
+ "ld1 { v28.s }[0], [x28], #0x4\n"
+ "tbz x6, #1, 20f\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[6], [x28]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[4], [x28]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x8, #1, 22f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x6, #1, 22f\n"
+ "ld1 { v28.h }[0], [x28], #0x2\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[2], [x28]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[0], [x28]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr x25, [x12, #0x40]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "ldr x27, [x16, #0x40]\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "add x25, x25, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v31.s }[0], [x25], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v31.h }[2], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[6], [x25]\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "add x27, x27, x8\n"
+ "tbz x6, #2, 25f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x6, #1, 24f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[6], [x27]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[4], [x25]\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[4], [x27]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v31.h }[0], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[2], [x25]\n"
+ "tbz x6, #1, 26f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[2], [x27]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[0], [x25]\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[0], [x27]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr x19, [x12, #0x48]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "ldr x26, [x16, #0x48]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 29f\n"
- "ld1 { v30.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 28f\n"
- "ld1 { v30.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[6], [x19]\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "add x26, x26, x8\n"
+ "tbz x6, #2, 29f\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "tbz x6, #1, 28f\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[6], [x26]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[4], [x19]\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[4], [x26]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x8, #1, 30f\n"
- "ld1 { v30.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[2], [x19]\n"
+ "tbz x6, #1, 30f\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[2], [x26]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[0], [x19]\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[0], [x26]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x24, [x12, #0x50]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "add x24, x24, x15\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "add x25, x25, x8\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "tbz x6, #2, 33f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x6, #1, 32f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x6, #1, 34f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x23, [x12, #0x58]\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x24, [x16, #0x58]\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "add x24, x24, x8\n"
+ "tbz x6, #2, 37f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x6, #1, 36f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x6, #1, 38f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "add x23, x23, x8\n"
+ "tbz x6, #2, 41f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "tbz x6, #1, 40f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x6, #1, 42f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[0], [x23]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr x21, [x12, #0x68]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "ldr x22, [x16, #0x68]\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "add x22, x22, x8\n"
+ "tbz x6, #2, 45f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x6, #1, 44f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x6, #1, 46f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "add x21, x21, x8\n"
+ "tbz x6, #2, 49f\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "tbz x6, #1, 48f\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x6, #1, 50f\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x19, [x12, #0x78]\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v28.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v28.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[6], [x19]\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x20, [x16, #0x78]\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "add x20, x20, x8\n"
+ "tbz x6, #2, 53f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x6, #1, 52f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[4], [x19]\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v28.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[2], [x19]\n"
+ "tbz x6, #1, 54f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[0], [x19]\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v21.4s }, [x13], #0x10\n"
- "ld1 { v25.4s }, [x11], #0x10\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v10.d }[0], [x13], #0x8\n"
- "ld1 { v16.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v16.s }[2], [x11]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "tbz x6, #2, 57f\n"
+ "ld1 { v17.4s }, [x14], #0x10\n"
+ "ld1 { v22.4s }, [x13], #0x10\n"
+ "tbz x6, #1, 56f\n"
+ "ld1 { v23.d }[0], [x14], #0x8\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v23.s }[2], [x14]\n"
+ "ld1 { v19.s }[2], [x13]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v10.s }[0], [x13]\n"
- "ld1 { v16.s }[0], [x11]\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v23.s }[0], [x14]\n"
+ "ld1 { v19.s }[0], [x13]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v21.d }[0], [x13], #0x8\n"
- "ld1 { v25.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[2], [x13]\n"
- "ld1 { v25.s }[2], [x11]\n"
+ "tbz x6, #1, 58f\n"
+ "ld1 { v17.d }[0], [x14], #0x8\n"
+ "ld1 { v22.d }[0], [x13], #0x8\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v17.s }[2], [x14]\n"
+ "ld1 { v22.s }[2], [x13]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[0], [x13]\n"
- "ld1 { v25.s }[0], [x11]\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v17.s }[0], [x14]\n"
+ "ld1 { v22.s }[0], [x13]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "add x12, x12, x17\n"
+ "add x11, x11, x17\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "add x10, x10, x17\n"
+ "add x9, x9, x17\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "tbz x8, #2, 61f\n"
- "st1 { v13.s }[0], [x10], #0x4\n"
- "st1 { v19.s }[0], [x9], #0x4\n"
- "st1 { v18.s }[0], [x28], #0x4\n"
- "st1 { v9.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 60f\n"
- "st1 { v13.h }[2], [x10], #0x2\n"
- "st1 { v19.h }[2], [x9], #0x2\n"
- "st1 { v18.h }[2], [x28], #0x2\n"
- "st1 { v9.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[6], [x10], #0x1\n"
- "st1 { v19.b }[6], [x9], #0x1\n"
- "st1 { v18.b }[6], [x28], #0x1\n"
- "st1 { v9.b }[6], [x27], #0x1\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "tbz x6, #2, 61f\n"
+ "st1 { v13.s }[0], [x12], #0x4\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v16.s }[0], [x10], #0x4\n"
+ "st1 { v25.s }[0], [x9], #0x4\n"
+ "tbz x6, #1, 60f\n"
+ "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v16.h }[2], [x10], #0x2\n"
+ "st1 { v25.h }[2], [x9], #0x2\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v16.b }[6], [x10], #0x1\n"
+ "st1 { v25.b }[6], [x9], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[4], [x10], #0x1\n"
- "st1 { v19.b }[4], [x9], #0x1\n"
- "st1 { v18.b }[4], [x28], #0x1\n"
- "st1 { v9.b }[4], [x27], #0x1\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[4], [x12], #0x1\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v16.b }[4], [x10], #0x1\n"
+ "st1 { v25.b }[4], [x9], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "st1 { v13.h }[0], [x10], #0x2\n"
- "st1 { v19.h }[0], [x9], #0x2\n"
- "st1 { v18.h }[0], [x28], #0x2\n"
- "st1 { v9.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[2], [x10], #0x1\n"
- "st1 { v19.b }[2], [x9], #0x1\n"
- "st1 { v18.b }[2], [x28], #0x1\n"
- "st1 { v9.b }[2], [x27], #0x1\n"
+ "tbz x6, #1, 62f\n"
+ "st1 { v13.h }[0], [x12], #0x2\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v16.h }[0], [x10], #0x2\n"
+ "st1 { v25.h }[0], [x9], #0x2\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v16.b }[2], [x10], #0x1\n"
+ "st1 { v25.b }[2], [x9], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[0], [x10], #0x1\n"
- "st1 { v19.b }[0], [x9], #0x1\n"
- "st1 { v18.b }[0], [x28], #0x1\n"
- "st1 { v9.b }[0], [x27], #0x1\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[0], [x12], #0x1\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v16.b }[0], [x10], #0x1\n"
+ "st1 { v25.b }[0], [x9], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 0216786c6f..de072a7d55 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,75 +100,75 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.16b }, [x24]\n"
- "ld1r { v13.16b }, [x23]\n"
- "lsr x16, x8, #0x3\n"
- "ld1r { v11.8h }, [x21]\n"
- "ld1r { v17.8h }, [x20]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "ld1r { v14.8h }, [x19]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v16.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x22, #0x0]\n"
- "ldp x28, x27, [x22, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v9.16b, v15.16b\n"
- "ldr q10, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v16.16b, v10.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v21.16b, v10.16b\n"
- "mov v23.16b, v15.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v18.16b, v10.16b\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
"usubl v0.8h, v0.8b, v13.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"usubl v1.8h, v1.8b, v13.8b\n"
"usubl v2.8h, v2.8b, v13.8b\n"
- "ldp x26, x25, [x12, #0x0]\n"
- "ldp x24, x23, [x12, #0x10]\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"usubl v3.8h, v3.8b, v13.8b\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "ldp x22, x21, [x12, #0x20]\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
"usubl v5.8h, v5.8b, v13.8b\n"
"usubl v6.8h, v6.8b, v13.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
"usubl v7.8h, v7.8b, v13.8b\n"
"usubl v8.8h, v8.8b, v13.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "ldr q15, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d31, [x27, x17]\n"
+ "ldr d30, [x26, x17]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr d28, [x24, x17]\n"
"usubl v31.8h, v31.8b, v12.8b\n"
"usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d27, [x23, x17]\n"
+ "ldr d26, [x22, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr d24, [x20, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
"usubl v25.8h, v25.8b, v12.8b\n"
@@ -176,250 +176,250 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x19, [x12, #0x58]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x22, [x12, #0x78]\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x0]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "ldr x21, [x12, #0x80]\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x21, [x15, #0x80]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "ldr x20, [x12, #0x88]\n"
- "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x12, #0x70]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr x24, [x12, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x25, [x15, #0x98]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "ldr x22, [x12, #0xa8]\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x21, x17]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "ldr x21, [x12, #0xb0]\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x12, #0xc0]\n"
- "ldr q19, [x13, #0x0]\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x24, x17]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q4, [x13, #0x10]\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "ldr q31, [x11, #0x10]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x14, x14, #0x48\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "add x17, x17, #0x48\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
- "subs x16, x16, #0x1\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x15, x15, #0x8\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "add x11, x11, #0x20\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
+ "str d15, [x11, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str d9, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d22, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q10, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
+ "str d10, [x10, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str d9, [x9, x16]\n"
+ "str d21, [x28, x16]\n"
+ "ldr q15, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"mov v9.16b, v15.16b\n"
- "mov v16.16b, v10.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v22.16b, v15.16b\n"
- "mov v21.16b, v10.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v15.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"usubl v0.8h, v0.8b, v13.8b\n"
"usubl v1.8h, v1.8b, v13.8b\n"
- "ldp x26, x25, [x12, #0x0]\n"
- "ldp x24, x23, [x12, #0x10]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"usubl v2.8h, v2.8b, v13.8b\n"
"usubl v3.8h, v3.8b, v13.8b\n"
- "ldp x22, x21, [x12, #0x20]\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d31, [x27, x17]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
"usubl v5.8h, v5.8b, v13.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
+ "ldr d30, [x26, x17]\n"
+ "ldr d29, [x25, x17]\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"usubl v7.8h, v7.8b, v13.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "ldr d28, [x24, x17]\n"
+ "ldr d27, [x23, x17]\n"
"usubl v8.8h, v8.8b, v13.8b\n"
"usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr d25, [x21, x17]\n"
"usubl v30.8h, v30.8b, v12.8b\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d24, [x20, x17]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
@@ -428,966 +428,966 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x19, [x12, #0x58]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x22, [x12, #0x78]\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x0]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "ldr x21, [x12, #0x80]\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x21, [x15, #0x80]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "ldr x20, [x12, #0x88]\n"
- "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x12, #0x70]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr x24, [x12, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x25, [x15, #0x98]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "ldr x22, [x12, #0xa8]\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x21, x17]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "ldr x21, [x12, #0xb0]\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x12, #0xc0]\n"
- "ldr q19, [x13, #0x0]\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x24, x17]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q4, [x13, #0x10]\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "ldr q31, [x11, #0x10]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "tst x7, #0x7\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "tst x8, #0x7\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
"add x13, x13, #0x20\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x11, x11, #0x20\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x15, x15, #0x8\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
+ "str d15, [x11, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str d9, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d22, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "str d10, [x10, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str d9, [x9, x16]\n"
+ "str d21, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 88f\n"
- "add x17, x17, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v15.4s }, [x24], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v17.d }[0], [x24], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v10.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v17.s }[0], [x24]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v15.d }[0], [x24], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[0], [x24]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"mov v9.16b, v15.16b\n"
- "mov v16.16b, v10.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v22.16b, v15.16b\n"
- "mov v21.16b, v10.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
- "mov v23.16b, v15.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
"usubl v0.8h, v0.8b, v13.8b\n"
"usubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x26, x25, [x12, #0x0]\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"usubl v2.8h, v2.8b, v13.8b\n"
"usubl v3.8h, v3.8b, v13.8b\n"
- "ldp x24, x23, [x12, #0x10]\n"
- "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
"usubl v5.8h, v5.8b, v13.8b\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"usubl v7.8h, v7.8b, v13.8b\n"
"usubl v8.8h, v8.8b, v13.8b\n"
- "add x26, x26, x15\n"
- "add x25, x25, x15\n"
- "add x24, x24, x15\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x26], #0x4\n"
- "ld1 { v30.s }[0], [x25], #0x4\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v30.b }[6], [x25]\n"
- "ld1 { v29.b }[6], [x24]\n"
- "ld1 { v28.b }[6], [x23]\n"
- "ld1 { v27.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v30.b }[4], [x25]\n"
- "ld1 { v29.b }[4], [x24]\n"
- "ld1 { v28.b }[4], [x23]\n"
- "ld1 { v27.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x26], #0x2\n"
- "ld1 { v30.h }[0], [x25], #0x2\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v30.b }[2], [x25]\n"
- "ld1 { v29.b }[2], [x24]\n"
- "ld1 { v28.b }[2], [x23]\n"
- "ld1 { v27.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x26]\n"
- "ld1 { v30.b }[0], [x25]\n"
- "ld1 { v29.b }[0], [x24]\n"
- "ld1 { v28.b }[0], [x23]\n"
- "ld1 { v27.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v25.b }[0], [x20]\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"usubl v31.8h, v31.8b, v12.8b\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
"usubl v30.8h, v30.8b, v12.8b\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "add x24, x24, x15\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "add x24, x24, x17\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "tbz x8, #2, 13f\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "tbz x7, #2, 13f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 12f\n"
+ "tbz x7, #1, 12f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 14f\n"
+ "tbz x7, #1, 14f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[0], [x24]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 17f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 16f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x8, #1, 18f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x12, #0x50]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 21f\n"
+ "ldr x21, [x15, #0x50]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 21f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 20f\n"
+ "tbz x7, #1, 20f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x8, #1, 22f\n"
+ "tbz x7, #1, 22f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x19, [x12, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v26.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v26.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[6], [x19]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[4], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v26.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[2], [x19]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[0], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
"usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 29f\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 28f\n"
+ "tbz x7, #1, 28f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 30f\n"
+ "tbz x7, #1, 30f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
"usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x19, [x12, #0x70]\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[0], [x21]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
"usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x12, #0x78]\n"
+ "ldr x23, [x15, #0x78]\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
"usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x12, #0x80]\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
+ "ldr x21, [x15, #0x80]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 45f\n"
"ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
+ "tbz x7, #1, 44f\n"
"ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
+ "tbz x7, #1, 46f\n"
"ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x20, [x12, #0x88]\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
"usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[6], [x23]\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "add x24, x24, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v25.s }[0], [x24], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[6], [x24]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[4], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[4], [x24]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[2], [x23]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v25.h }[0], [x24], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[2], [x24]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[0], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[0], [x24]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
"usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x24, [x12, #0x98]\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "add x24, x24, x15\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "add x25, x25, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 61f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 60f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x7, #1, 62f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
"usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x22, [x12, #0xa8]\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 65f\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 64f\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[6], [x22]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[6], [x23]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[4], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[4], [x23]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 66f\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[2], [x22]\n"
+ "tbz x7, #1, 66f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[2], [x23]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[0], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[0], [x23]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
"usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x12, #0xb0]\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 69f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 68f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x8, #1, 70f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x7, #1, 70f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
"usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 73f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 72f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[6], [x21]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[4], [x21]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x8, #1, 74f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x7, #1, 74f\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[2], [x21]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[0], [x21]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
"usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x19, [x12, #0xc0]\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 77f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 76f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x8, #1, 78f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 78f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "tbz x8, #2, 81f\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "tbz x7, #2, 81f\n"
"ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v0.4s }, [x11], #0x10\n"
- "tbz x8, #1, 80f\n"
- "ld1 { v4.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
- "ld1 { v4.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x11]\n"
+ "ld1 { v18.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v30.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 83f\n"
- "ld1 { v4.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x11]\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 82f\n"
+ "tbz x7, #1, 82f\n"
"ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v0.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
+ "ld1 { v18.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
"ld1 { v19.s }[2], [x13]\n"
- "ld1 { v0.s }[2], [x11]\n"
+ "ld1 { v18.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 83f\n"
+ "tbz x7, #0, 83f\n"
"ld1 { v19.s }[0], [x13]\n"
- "ld1 { v0.s }[0], [x11]\n"
+ "ld1 { v18.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
"sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "tbz x8, #2, 85f\n"
- "st1 { v15.s }[0], [x10], #0x4\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v15.s }[0], [x11], #0x4\n"
+ "st1 { v10.s }[0], [x10], #0x4\n"
"st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v22.s }[0], [x28], #0x4\n"
- "st1 { v23.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 84f\n"
- "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v21.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "st1 { v10.h }[2], [x10], #0x2\n"
"st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v22.h }[2], [x28], #0x2\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v21.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[6], [x11], #0x1\n"
+ "st1 { v10.b }[6], [x10], #0x1\n"
"st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v22.b }[6], [x28], #0x1\n"
- "st1 { v23.b }[6], [x27], #0x1\n"
+ "st1 { v21.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[4], [x10], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[4], [x11], #0x1\n"
+ "st1 { v10.b }[4], [x10], #0x1\n"
"st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v22.b }[4], [x28], #0x1\n"
- "st1 { v23.b }[4], [x27], #0x1\n"
+ "st1 { v21.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 86f\n"
- "st1 { v15.h }[0], [x10], #0x2\n"
+ "tbz x7, #1, 86f\n"
+ "st1 { v15.h }[0], [x11], #0x2\n"
+ "st1 { v10.h }[0], [x10], #0x2\n"
"st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v22.h }[0], [x28], #0x2\n"
- "st1 { v23.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v21.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[2], [x11], #0x1\n"
+ "st1 { v10.b }[2], [x10], #0x1\n"
"st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v22.b }[2], [x28], #0x1\n"
- "st1 { v23.b }[2], [x27], #0x1\n"
+ "st1 { v21.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[0], [x10], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[0], [x11], #0x1\n"
+ "st1 { v10.b }[0], [x10], #0x1\n"
"st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v22.b }[0], [x28], #0x1\n"
- "st1 { v23.b }[0], [x27], #0x1\n"
+ "st1 { v21.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index bd6fa1d443..2fe688a65e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,2073 +111,2073 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x10, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x17, x10, %[offsetof_Requantize32_a_offset]\n"
- "add x9, x10, %[offsetof_Requantize32_b_offset]\n"
- "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x4, x10, %[offsetof_Requantize32_c_offset]\n"
- "add x14, x10, %[offsetof_Requantize32_minval]\n"
- "ldr x23, [%x[params], %[offsetof_Params_weights]]\n"
- "add x5, x10, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v9.16b }, [x17]\n"
- "ld1r { v14.16b }, [x9]\n"
- "lsr x3, x0, #0x3\n"
- "ld1r { v18.8h }, [x4]\n"
- "ld1r { v11.8h }, [x14]\n"
- "mov x24, #0x0\n"
- "mov x22, #0x0\n"
- "ld1r { v13.8h }, [x5]\n"
- "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x20, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x16, x8, [x25, #0x0]\n"
- "ldp x4, x7, [x25, #0x10]\n"
- "cbz x3, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x3, x3, #0x1\n"
- "mov v17.16b, v15.16b\n"
- "ldr q16, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "ldr d2, [x23, #0x10]\n"
- "mov v8.16b, v16.16b\n"
- "mov v10.16b, v15.16b\n"
- "ldr d3, [x23, #0x18]\n"
- "ldr d4, [x23, #0x20]\n"
- "mov v7.16b, v16.16b\n"
- "mov v6.16b, v15.16b\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "mov v5.16b, v16.16b\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ldr d31, [x28, x24]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr d30, [x6, x24]\n"
- "ldr d29, [x26, x24]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v9.16b }, [x3]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
+ "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x11]\n"
+ "ld1r { v14.8h }, [x5]\n"
+ "add x3, x13, %[offsetof_Requantize32_minval]\n"
+ "add x15, x13, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.8h }, [x3]\n"
+ "ld1r { v11.8h }, [x15]\n"
+ "mov x0, #0x0\n"
+ "mov x10, #0x0\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x6, [x24, #0x0]\n"
+ "ldp x7, x16, [x24, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q13, [x13, #0x0]\n"
+ "ldr q19, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d31, [x9, x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d30, [x28, x0]\n"
+ "ldr d29, [x27, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x25, x24]\n"
- "ldr d27, [x5, x24]\n"
+ "ldr d28, [x26, x0]\n"
+ "ldr d27, [x25, x0]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x2, x24]\n"
- "ldr d25, [x27, x24]\n"
+ "ldr d23, [x24, x0]\n"
+ "ldr d25, [x23, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x21, x24]\n"
- "ldr d26, [x12, x24]\n"
+ "ldr d24, [x22, x0]\n"
+ "ldr d26, [x21, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x19, x24]\n"
+ "ldr d22, [x20, x0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
"usubl v22.8h, v22.8b, v9.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr x19, [x20, #0x50]\n"
- "ldr d31, [x19, x24]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
- "ldr x15, [x20, #0x58]\n"
+ "ldr q18, [x5, #0x0]\n"
+ "ldr q6, [x8, #0x0]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr q5, [x5, #0x10]\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "ldr x19, [x20, #0x60]\n"
- "ldr x27, [x20, #0x68]\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x5, [x20, #0x70]\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x15, x24]\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x0]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "ldr d0, [x23, #0x28]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "ldr x12, [x20, #0x80]\n"
- "ldr x26, [x20, #0x88]\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x4, #0x78]\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x14, [x20, #0x90]\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "ldr x24, [x4, #0x88]\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x0]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
- "ldr d1, [x23, #0x30]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "ldr x21, [x20, #0xa0]\n"
- "ldr x2, [x20, #0xa8]\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x13, [x20, #0xb0]\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x27, x24]\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
- "ldr d2, [x23, #0x38]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "ldr x19, [x20, #0xc0]\n"
- "ldr x28, [x20, #0xc8]\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x6, [x20, #0xd0]\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x5, x24]\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x26, x0]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
- "ldr d3, [x23, #0x40]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "ldr d27, [x11, x24]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x25, x0]\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "ldr x11, [x20, #0xe0]\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0x48]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
- "ldr x5, [x20, #0xf0]\n"
- "ldr q12, [x10, #0x0]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "ldr q19, [x1, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "ldr q29, [x1, #0x10]\n"
- "subs x3, x3, #0x1\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d28, [x26, x24]\n"
- "ldr d0, [x23, #0x50]\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x0]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "add x10, x10, #0x20\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "add x1, x1, #0x20\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "ldr d23, [x12, x24]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
- "ldr d1, [x23, #0x58]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d31, [x14, x24]\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "ldr d2, [x23, #0x60]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "ldr d30, [x15, x24]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
- "ldr d3, [x23, #0x68]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "ldr d26, [x21, x24]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x0]\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x0]\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "ldr x14, [x20, #0x110]\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x23, #0x70]\n"
- "ldr d22, [x9, x24]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "ldr d25, [x2, x24]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
- "ldr d0, [x23, #0x78]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "ldr d24, [x13, x24]\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
- "ldr d1, [x23, #0x80]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "ldr d2, [x23, #0x88]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "ldr d23, [x28, x24]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x22, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "ldr d3, [x23, #0x90]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "ldr d28, [x11, x24]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x9, x0]\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x0]\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr d31, [x6, x24]\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
- "ldr d4, [x23, #0x98]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x24]\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr d0, [x23, #0xa0]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d26, [x17, x24]\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr d1, [x23, #0xa8]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d25, [x5, x24]\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr d2, [x23, #0xb0]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "ldr d24, [x25, x24]\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr d3, [x23, #0xb8]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x0]\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "ldr d27, [x26, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0xc0]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "add x23, x23, #0xc8\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "ldr q22, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "add x3, x3, #0xc8\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x12, x24]\n"
+ "ldr d25, [x15, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x14, x24]\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ldr d24, [x21, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x24]\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ldr d27, [x20, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x24, x24, #0x8\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "add x0, x0, #0x8\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d13, [x17, x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x16, x22]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d17, [x8, x22]\n"
- "str d10, [x4, x22]\n"
- "str d6, [x7, x22]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x22, x22, #0x8\n"
- "ldr q16, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "ldr d2, [x23, #0x10]\n"
- "mov v17.16b, v15.16b\n"
- "mov v8.16b, v16.16b\n"
- "ldr d3, [x23, #0x18]\n"
- "ldr d4, [x23, #0x20]\n"
- "mov v10.16b, v15.16b\n"
- "mov v7.16b, v16.16b\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "mov v6.16b, v15.16b\n"
- "mov v5.16b, v16.16b\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ldr d31, [x28, x24]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "ldr d30, [x6, x24]\n"
- "ldr d29, [x26, x24]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
+ "str d20, [x6, x10]\n"
+ "str d8, [x7, x10]\n"
+ "str d17, [x16, x10]\n"
+ "ldr q13, [x13, #0x0]\n"
+ "ldr q19, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "add x10, x10, #0x8\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d31, [x9, x0]\n"
+ "ldr d30, [x28, x0]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d29, [x27, x0]\n"
+ "ldr d28, [x26, x0]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d28, [x25, x24]\n"
- "ldr d27, [x5, x24]\n"
+ "ldr d27, [x25, x0]\n"
+ "ldr d23, [x24, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
"usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d23, [x2, x24]\n"
- "ldr d25, [x27, x24]\n"
+ "ldr d25, [x23, x0]\n"
+ "ldr d24, [x22, x0]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d24, [x21, x24]\n"
- "ldr d26, [x12, x24]\n"
+ "ldr d26, [x21, x0]\n"
+ "ldr d22, [x20, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr d22, [x19, x24]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
"usubl v26.8h, v26.8b, v9.8b\n"
"usubl v22.8h, v22.8b, v9.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr x19, [x20, #0x50]\n"
- "ldr d31, [x19, x24]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
- "ldr x15, [x20, #0x58]\n"
+ "ldr q18, [x5, #0x0]\n"
+ "ldr q6, [x8, #0x0]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr q5, [x5, #0x10]\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "ldr x19, [x20, #0x60]\n"
- "ldr x27, [x20, #0x68]\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x5, [x20, #0x70]\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x15, x24]\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x0]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "ldr d0, [x23, #0x28]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "ldr x12, [x20, #0x80]\n"
- "ldr x26, [x20, #0x88]\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x4, #0x78]\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x14, [x20, #0x90]\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "ldr x24, [x4, #0x88]\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x0]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
- "ldr d1, [x23, #0x30]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "ldr x21, [x20, #0xa0]\n"
- "ldr x2, [x20, #0xa8]\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x13, [x20, #0xb0]\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x27, x24]\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
- "ldr d2, [x23, #0x38]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "ldr x19, [x20, #0xc0]\n"
- "ldr x28, [x20, #0xc8]\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x6, [x20, #0xd0]\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x5, x24]\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x26, x0]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
- "ldr d3, [x23, #0x40]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "ldr d27, [x11, x24]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x25, x0]\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "ldr x11, [x20, #0xe0]\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0x48]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
- "ldr x5, [x20, #0xf0]\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "ldr q12, [x10, #0x0]\n"
- "ldr q19, [x1, #0x0]\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "ldr q20, [x10, #0x10]\n"
- "ldr q29, [x1, #0x10]\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d28, [x26, x24]\n"
- "ldr d0, [x23, #0x50]\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x0]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "tst x0, #0x7\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "add x10, x10, #0x20\n"
- "add x1, x1, #0x20\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "ldr d23, [x12, x24]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "tst x1, #0x7\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
- "ldr d1, [x23, #0x58]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d31, [x14, x24]\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "ldr d2, [x23, #0x60]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "ldr x14, [x20, #0x110]\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "ldr d30, [x15, x24]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
- "ldr d3, [x23, #0x68]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "ldr d26, [x21, x24]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x0]\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x0]\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x23, #0x70]\n"
- "ldr d22, [x9, x24]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "ldr d25, [x2, x24]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
- "ldr d0, [x23, #0x78]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "ldr d24, [x13, x24]\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
- "ldr d1, [x23, #0x80]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "ldr d2, [x23, #0x88]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "ldr d23, [x28, x24]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x22, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "ldr d3, [x23, #0x90]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "ldr d28, [x11, x24]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x9, x0]\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x0]\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr d31, [x6, x24]\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
- "ldr d4, [x23, #0x98]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x24]\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr d0, [x23, #0xa0]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d26, [x17, x24]\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr d1, [x23, #0xa8]\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d25, [x5, x24]\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr d2, [x23, #0xb0]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "ldr d24, [x25, x24]\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr d3, [x23, #0xb8]\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x0]\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "ldr d27, [x26, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0xc0]\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "ldr q22, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x12, x24]\n"
+ "ldr d25, [x15, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x14, x24]\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ldr d24, [x21, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x24]\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ldr d27, [x20, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x24, x24, #0x8\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "add x0, x0, #0x8\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d13, [x17, x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x16, x22]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d17, [x8, x22]\n"
- "str d10, [x4, x22]\n"
- "str d6, [x7, x22]\n"
- "add x22, x22, #0x8\n"
+ "str d20, [x6, x10]\n"
+ "str d8, [x7, x10]\n"
+ "str d17, [x16, x10]\n"
+ "add x10, x10, #0x8\n"
"beq 124f\n"
- "add x23, x23, #0xc8\n"
+ "add x3, x3, #0xc8\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x0, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x0, #1, 4f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
- "tbz x0, #0, 7f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v19.s }[2], [x13]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x0, #0, 7f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v19.s }[0], [x13]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x0, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x0, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x1, #1, 6f\n"
+ "ld1 { v13.d }[0], [x13], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v13.s }[2], [x13]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v13.s }[0], [x13]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "mov v17.16b, v15.16b\n"
- "mov v8.16b, v16.16b\n"
- "ldr d2, [x23, #0x10]\n"
- "ldr d3, [x23, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v7.16b, v16.16b\n"
- "ldr d4, [x23, #0x20]\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "mov v6.16b, v15.16b\n"
- "mov v5.16b, v16.16b\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "add x28, x28, x24\n"
- "add x6, x6, x24\n"
- "add x26, x26, x24\n"
- "add x25, x25, x24\n"
- "add x5, x5, x24\n"
- "add x2, x2, x24\n"
- "add x27, x27, x24\n"
- "add x21, x21, x24\n"
- "add x12, x12, x24\n"
- "add x19, x19, x24\n"
- "tbz x0, #2, 9f\n"
- "ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x6], #0x4\n"
- "ld1 { v29.s }[0], [x26], #0x4\n"
- "ld1 { v28.s }[0], [x25], #0x4\n"
- "ld1 { v27.s }[0], [x5], #0x4\n"
- "ld1 { v23.s }[0], [x2], #0x4\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x12], #0x4\n"
- "ld1 { v22.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 8f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x6], #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x5], #0x2\n"
- "ld1 { v23.h }[2], [x2], #0x2\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v22.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x6]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x5]\n"
- "ld1 { v23.b }[6], [x2]\n"
- "ld1 { v25.b }[6], [x27]\n"
- "ld1 { v24.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x12]\n"
- "ld1 { v22.b }[6], [x19]\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "add x9, x9, x0\n"
+ "add x28, x28, x0\n"
+ "add x27, x27, x0\n"
+ "add x26, x26, x0\n"
+ "add x25, x25, x0\n"
+ "add x24, x24, x0\n"
+ "add x23, x23, x0\n"
+ "add x22, x22, x0\n"
+ "add x21, x21, x0\n"
+ "add x20, x20, x0\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x27], #0x4\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x6]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x5]\n"
- "ld1 { v23.b }[4], [x2]\n"
- "ld1 { v25.b }[4], [x27]\n"
- "ld1 { v24.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x12]\n"
- "ld1 { v22.b }[4], [x19]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x0, #1, 10f\n"
- "ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x6], #0x2\n"
- "ld1 { v29.h }[0], [x26], #0x2\n"
- "ld1 { v28.h }[0], [x25], #0x2\n"
- "ld1 { v27.h }[0], [x5], #0x2\n"
- "ld1 { v23.h }[0], [x2], #0x2\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
- "ld1 { v26.h }[0], [x12], #0x2\n"
- "ld1 { v22.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x6]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x5]\n"
- "ld1 { v23.b }[2], [x2]\n"
- "ld1 { v25.b }[2], [x27]\n"
- "ld1 { v24.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x12]\n"
- "ld1 { v22.b }[2], [x19]\n"
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x27], #0x2\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x6]\n"
- "ld1 { v29.b }[0], [x26]\n"
- "ld1 { v28.b }[0], [x25]\n"
- "ld1 { v27.b }[0], [x5]\n"
- "ld1 { v23.b }[0], [x2]\n"
- "ld1 { v25.b }[0], [x27]\n"
- "ld1 { v24.b }[0], [x21]\n"
- "ld1 { v26.b }[0], [x12]\n"
- "ld1 { v22.b }[0], [x19]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x27]\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v22.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x19, [x20, #0x50]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x4, #0x50]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "add x19, x19, x24\n"
+ "add x20, x20, x0\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "tbz x0, #2, 13f\n"
- "ld1 { v31.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 12f\n"
- "ld1 { v31.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[6], [x19]\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[4], [x19]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x0, #1, 14f\n"
- "ld1 { v31.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[2], [x19]\n"
+ "tbz x1, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[0], [x19]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x15, [x20, #0x58]\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "add x15, x15, x24\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "add x22, x22, x0\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "tbz x0, #2, 17f\n"
- "ld1 { v30.s }[0], [x15], #0x4\n"
- "tbz x0, #1, 16f\n"
- "ld1 { v30.h }[2], [x15], #0x2\n"
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[6], [x15]\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[4], [x15]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x0, #1, 18f\n"
- "ld1 { v30.h }[0], [x15], #0x2\n"
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[2], [x15]\n"
+ "tbz x1, #1, 18f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[0], [x15]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x19, [x20, #0x60]\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "add x19, x19, x24\n"
- "tbz x0, #2, 21f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 20f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "add x21, x21, x0\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x0, #1, 22f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x1, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d0, [x3, #0x28]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d0, [x23, #0x28]\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "ldr x27, [x20, #0x68]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x27, x27, x24\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
"smlal2 v7.4s, v22.8h, v0.8h\n"
- "tbz x0, #2, 25f\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "tbz x0, #1, 24f\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[6], [x27]\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[4], [x27]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x0, #1, 26f\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[2], [x27]\n"
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[0], [x27]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x23, #0x30]\n"
+ "ldr d1, [x3, #0x30]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "ldr x5, [x20, #0x70]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "add x5, x5, x24\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "add x26, x26, x0\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
"smlal2 v7.4s, v25.8h, v1.8h\n"
- "tbz x0, #2, 29f\n"
- "ld1 { v24.s }[0], [x5], #0x4\n"
- "tbz x0, #1, 28f\n"
- "ld1 { v24.h }[2], [x5], #0x2\n"
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[6], [x5]\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v24.s }[0], [x26], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[6], [x26]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[4], [x5]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[4], [x26]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x0, #1, 30f\n"
- "ld1 { v24.h }[0], [x5], #0x2\n"
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[2], [x5]\n"
+ "tbz x1, #1, 30f\n"
+ "ld1 { v24.h }[0], [x26], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[2], [x26]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[0], [x5]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[0], [x26]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x23, #0x38]\n"
+ "ldr d2, [x3, #0x38]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "add x11, x11, x24\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x25, [x4, #0x78]\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "add x25, x25, x0\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
"smlal2 v7.4s, v24.8h, v2.8h\n"
- "tbz x0, #2, 33f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
- "tbz x0, #1, 32f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x0, #1, 34f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "tbz x1, #1, 34f\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[0], [x25]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x23, #0x40]\n"
+ "ldr d3, [x3, #0x40]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "ldr x12, [x20, #0x80]\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "add x12, x12, x24\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "add x23, x23, x0\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
"smlal2 v7.4s, v27.8h, v3.8h\n"
- "tbz x0, #2, 37f\n"
- "ld1 { v23.s }[0], [x12], #0x4\n"
- "tbz x0, #1, 36f\n"
- "ld1 { v23.h }[2], [x12], #0x2\n"
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[6], [x12]\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[6], [x23]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[4], [x12]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[4], [x23]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x0, #1, 38f\n"
- "ld1 { v23.h }[0], [x12], #0x2\n"
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[2], [x12]\n"
+ "tbz x1, #1, 38f\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[2], [x23]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[0], [x12]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[0], [x23]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x23, #0x48]\n"
+ "ldr d4, [x3, #0x48]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr x26, [x20, #0x88]\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "add x26, x26, x24\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x24, [x4, #0x88]\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "add x24, x24, x0\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "tbz x0, #2, 41f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x0, #1, 40f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x0, #1, 42f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x1, #1, 42f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x23, #0x50]\n"
+ "ldr d0, [x3, #0x50]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "ldr x14, [x20, #0x90]\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "add x14, x14, x24\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "tbz x0, #2, 45f\n"
- "ld1 { v31.s }[0], [x14], #0x4\n"
- "tbz x0, #1, 44f\n"
- "ld1 { v31.h }[2], [x14], #0x2\n"
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[6], [x14]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "add x15, x15, x0\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x15], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x15], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x15]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[4], [x14]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x15]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x0, #1, 46f\n"
- "ld1 { v31.h }[0], [x14], #0x2\n"
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[2], [x14]\n"
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x15], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x15]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[0], [x14]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x15]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
"smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x15, x15, x24\n"
- "tbz x0, #2, 49f\n"
- "ld1 { v30.s }[0], [x15], #0x4\n"
- "tbz x0, #1, 48f\n"
- "ld1 { v30.h }[2], [x15], #0x2\n"
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[6], [x15]\n"
+ "add x21, x21, x0\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[4], [x15]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x0, #1, 50f\n"
- "ld1 { v30.h }[0], [x15], #0x2\n"
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[2], [x15]\n"
+ "tbz x1, #1, 50f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[0], [x15]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x23, #0x58]\n"
+ "ldr d1, [x3, #0x58]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "ldr x21, [x20, #0xa0]\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "add x21, x21, x24\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "add x14, x14, x0\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
"smlal2 v7.4s, v30.8h, v1.8h\n"
- "tbz x0, #2, 53f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x0, #1, 52f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[6], [x14]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[4], [x14]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x0, #1, 54f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x1, #1, 54f\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[2], [x14]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[0], [x14]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x23, #0x60]\n"
+ "ldr d2, [x3, #0x60]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "ldr x2, [x20, #0xa8]\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "add x2, x2, x24\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "add x13, x13, x0\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
"smlal2 v7.4s, v26.8h, v2.8h\n"
- "tbz x0, #2, 57f\n"
- "ld1 { v25.s }[0], [x2], #0x4\n"
- "tbz x0, #1, 56f\n"
- "ld1 { v25.h }[2], [x2], #0x2\n"
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[6], [x2]\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v25.s }[0], [x13], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v25.h }[2], [x13], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[6], [x13]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[4], [x2]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[4], [x13]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x0, #1, 58f\n"
- "ld1 { v25.h }[0], [x2], #0x2\n"
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[2], [x2]\n"
+ "tbz x1, #1, 58f\n"
+ "ld1 { v25.h }[0], [x13], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[2], [x13]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[0], [x2]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[0], [x13]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x23, #0x68]\n"
+ "ldr d3, [x3, #0x68]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "ldr x13, [x20, #0xb0]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x13, x13, x24\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "add x12, x12, x0\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x0, #2, 61f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "tbz x0, #1, 60f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[6], [x13]\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v24.s }[0], [x12], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v24.h }[2], [x12], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[6], [x12]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[4], [x13]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[4], [x12]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x0, #1, 62f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[2], [x13]\n"
+ "tbz x1, #1, 62f\n"
+ "ld1 { v24.h }[0], [x12], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[2], [x12]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[0], [x13]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[0], [x12]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x23, #0x70]\n"
+ "ldr d4, [x3, #0x70]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x9, x9, x24\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x0, #2, 65f\n"
- "ld1 { v22.s }[0], [x9], #0x4\n"
- "tbz x0, #1, 64f\n"
- "ld1 { v22.h }[2], [x9], #0x2\n"
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[6], [x9]\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[4], [x9]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x0, #1, 66f\n"
- "ld1 { v22.h }[0], [x9], #0x2\n"
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[2], [x9]\n"
+ "tbz x1, #1, 66f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[0], [x9]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x23, #0x78]\n"
+ "ldr d0, [x3, #0x78]\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "ldr x19, [x20, #0xc0]\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "add x19, x19, x24\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "tbz x0, #2, 69f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 68f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "add x11, x11, x0\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x0, #1, 70f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x1, #1, 70f\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[0], [x11]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x28, [x20, #0xc8]\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
"smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x28, x28, x24\n"
- "tbz x0, #2, 73f\n"
- "ld1 { v23.s }[0], [x28], #0x4\n"
- "tbz x0, #1, 72f\n"
- "ld1 { v23.h }[2], [x28], #0x2\n"
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[6], [x28]\n"
+ "add x22, x22, x0\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v23.s }[0], [x22], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[6], [x22]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[4], [x28]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[4], [x22]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x0, #1, 74f\n"
- "ld1 { v23.h }[0], [x28], #0x2\n"
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[2], [x28]\n"
+ "tbz x1, #1, 74f\n"
+ "ld1 { v23.h }[0], [x22], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[2], [x22]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[0], [x28]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[0], [x22]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x23, #0x80]\n"
+ "ldr d1, [x3, #0x80]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "ldr x6, [x20, #0xd0]\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "add x6, x6, x24\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "add x9, x9, x0\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
"smlal2 v7.4s, v23.8h, v1.8h\n"
- "tbz x0, #2, 77f\n"
- "ld1 { v31.s }[0], [x6], #0x4\n"
- "tbz x0, #1, 76f\n"
- "ld1 { v31.h }[2], [x6], #0x2\n"
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[6], [x6]\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[6], [x9]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[4], [x6]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[4], [x9]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x0, #1, 78f\n"
- "ld1 { v31.h }[0], [x6], #0x2\n"
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[2], [x6]\n"
+ "tbz x1, #1, 78f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[2], [x9]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[0], [x6]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[0], [x9]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x23, #0x88]\n"
+ "ldr d2, [x3, #0x88]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "add x27, x27, x24\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "add x28, x28, x0\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
"smlal2 v7.4s, v31.8h, v2.8h\n"
- "tbz x0, #2, 81f\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "tbz x0, #1, 80f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[6], [x28]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[4], [x28]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x0, #1, 82f\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "tbz x1, #1, 82f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[2], [x28]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[0], [x28]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x23, #0x90]\n"
+ "ldr d3, [x3, #0x90]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "ldr x11, [x20, #0xe0]\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "add x11, x11, x24\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "add x27, x27, x0\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
"smlal2 v7.4s, v30.8h, v3.8h\n"
- "tbz x0, #2, 85f\n"
- "ld1 { v28.s }[0], [x11], #0x4\n"
- "tbz x0, #1, 84f\n"
- "ld1 { v28.h }[2], [x11], #0x2\n"
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[6], [x11]\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[6], [x27]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[4], [x11]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[4], [x27]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x0, #1, 86f\n"
- "ld1 { v28.h }[0], [x11], #0x2\n"
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[2], [x11]\n"
+ "tbz x1, #1, 86f\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[2], [x27]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[0], [x11]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[0], [x27]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x23, #0x98]\n"
+ "ldr d4, [x3, #0x98]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "add x17, x17, x24\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "add x26, x26, x0\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "tbz x0, #2, 89f\n"
- "ld1 { v26.s }[0], [x17], #0x4\n"
- "tbz x0, #1, 88f\n"
- "ld1 { v26.h }[2], [x17], #0x2\n"
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[6], [x17]\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[6], [x26]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[4], [x17]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[4], [x26]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x0, #1, 90f\n"
- "ld1 { v26.h }[0], [x17], #0x2\n"
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[2], [x17]\n"
+ "tbz x1, #1, 90f\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[2], [x26]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[0], [x17]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[0], [x26]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x23, #0xa0]\n"
+ "ldr d0, [x3, #0xa0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "usubl v0.8h, v0.8b, v14.8b\n"
- "ldr x5, [x20, #0xf0]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x5, x5, x24\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "tbz x0, #2, 93f\n"
- "ld1 { v25.s }[0], [x5], #0x4\n"
- "tbz x0, #1, 92f\n"
- "ld1 { v25.h }[2], [x5], #0x2\n"
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[6], [x5]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "add x25, x25, x0\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[6], [x25]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[4], [x5]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[4], [x25]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x0, #1, 94f\n"
- "ld1 { v25.h }[0], [x5], #0x2\n"
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[2], [x5]\n"
+ "tbz x1, #1, 94f\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[2], [x25]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[0], [x5]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[0], [x25]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x25, x25, x24\n"
- "tbz x0, #2, 97f\n"
- "ld1 { v24.s }[0], [x25], #0x4\n"
- "tbz x0, #1, 96f\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[6], [x25]\n"
+ "add x24, x24, x0\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[6], [x24]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[4], [x25]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[4], [x24]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x0, #1, 98f\n"
- "ld1 { v24.h }[0], [x25], #0x2\n"
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "tbz x1, #1, 98f\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[2], [x24]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[0], [x24]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x23, #0xa8]\n"
+ "ldr d1, [x3, #0xa8]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v14.8b\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "add x26, x26, x24\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "add x23, x23, x0\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "tbz x0, #2, 101f\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "tbz x0, #1, 100f\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[6], [x26]\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[4], [x26]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x0, #1, 102f\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[2], [x26]\n"
+ "tbz x1, #1, 102f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[0], [x26]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x23, #0xb0]\n"
+ "ldr d2, [x3, #0xb0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "usubl v2.8h, v2.8b, v14.8b\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "add x12, x12, x24\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "add x15, x15, x0\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "tbz x0, #2, 105f\n"
- "ld1 { v25.s }[0], [x12], #0x4\n"
- "tbz x0, #1, 104f\n"
- "ld1 { v25.h }[2], [x12], #0x2\n"
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[6], [x12]\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v25.s }[0], [x15], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v25.h }[2], [x15], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[6], [x15]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[4], [x12]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[4], [x15]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x0, #1, 106f\n"
- "ld1 { v25.h }[0], [x12], #0x2\n"
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[2], [x12]\n"
+ "tbz x1, #1, 106f\n"
+ "ld1 { v25.h }[0], [x15], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[2], [x15]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[0], [x12]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[0], [x15]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x23, #0xb8]\n"
+ "ldr d3, [x3, #0xb8]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v3.8h, v3.8b, v14.8b\n"
- "ldr x14, [x20, #0x110]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x14, x14, x24\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "add x21, x21, x0\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x0, #2, 109f\n"
- "ld1 { v24.s }[0], [x14], #0x4\n"
- "tbz x0, #1, 108f\n"
- "ld1 { v24.h }[2], [x14], #0x2\n"
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[6], [x14]\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[4], [x14]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x0, #1, 110f\n"
- "ld1 { v24.h }[0], [x14], #0x2\n"
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[2], [x14]\n"
+ "tbz x1, #1, 110f\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[0], [x14]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[0], [x21]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x23, #0xc0]\n"
+ "ldr d4, [x3, #0xc0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x21, x21, x24\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x0, #2, 113f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x0, #1, 112f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x0, #1, 114f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "tbz x1, #1, 114f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x0, #2, 117f\n"
- "ld1 { v12.4s }, [x10], #0x10\n"
- "ld1 { v19.4s }, [x1], #0x10\n"
- "tbz x0, #1, 116f\n"
- "ld1 { v20.d }[0], [x10], #0x8\n"
- "ld1 { v29.d }[0], [x1], #0x8\n"
- "tbz x0, #0, 119f\n"
- "ld1 { v20.s }[2], [x10]\n"
- "ld1 { v29.s }[2], [x1]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v18.4s }, [x5], #0x10\n"
+ "ld1 { v6.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v5.d }[0], [x5], #0x8\n"
+ "ld1 { v22.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v5.s }[2], [x5]\n"
+ "ld1 { v22.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x0, #0, 119f\n"
- "ld1 { v20.s }[0], [x10]\n"
- "ld1 { v29.s }[0], [x1]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v5.s }[0], [x5]\n"
+ "ld1 { v22.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x0, #1, 118f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
- "ld1 { v19.d }[0], [x1], #0x8\n"
- "tbz x0, #0, 119f\n"
- "ld1 { v12.s }[2], [x10]\n"
- "ld1 { v19.s }[2], [x1]\n"
+ "tbz x1, #1, 118f\n"
+ "ld1 { v18.d }[0], [x5], #0x8\n"
+ "ld1 { v6.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v18.s }[2], [x5]\n"
+ "ld1 { v6.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 119f\n"
- "ld1 { v12.s }[0], [x10]\n"
- "ld1 { v19.s }[0], [x1]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v18.s }[0], [x5]\n"
+ "ld1 { v6.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "add x16, x16, x22\n"
- "add x8, x8, x22\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "add x4, x4, x22\n"
- "add x7, x7, x22\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "add x17, x17, x10\n"
+ "add x6, x6, x10\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x7, x7, x10\n"
+ "add x16, x16, x10\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "tbz x0, #2, 121f\n"
- "st1 { v15.s }[0], [x16], #0x4\n"
- "st1 { v17.s }[0], [x8], #0x4\n"
- "st1 { v10.s }[0], [x4], #0x4\n"
- "st1 { v6.s }[0], [x7], #0x4\n"
- "tbz x0, #1, 120f\n"
- "st1 { v15.h }[2], [x16], #0x2\n"
- "st1 { v17.h }[2], [x8], #0x2\n"
- "st1 { v10.h }[2], [x4], #0x2\n"
- "st1 { v6.h }[2], [x7], #0x2\n"
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[6], [x16], #0x1\n"
- "st1 { v17.b }[6], [x8], #0x1\n"
- "st1 { v10.b }[6], [x4], #0x1\n"
- "st1 { v6.b }[6], [x7], #0x1\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v13.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x6], #0x4\n"
+ "st1 { v8.s }[0], [x7], #0x4\n"
+ "st1 { v17.s }[0], [x16], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v13.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x6], #0x2\n"
+ "st1 { v8.h }[2], [x7], #0x2\n"
+ "st1 { v17.h }[2], [x16], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x6], #0x1\n"
+ "st1 { v8.b }[6], [x7], #0x1\n"
+ "st1 { v17.b }[6], [x16], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[4], [x16], #0x1\n"
- "st1 { v17.b }[4], [x8], #0x1\n"
- "st1 { v10.b }[4], [x4], #0x1\n"
- "st1 { v6.b }[4], [x7], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x6], #0x1\n"
+ "st1 { v8.b }[4], [x7], #0x1\n"
+ "st1 { v17.b }[4], [x16], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x0, #1, 122f\n"
- "st1 { v15.h }[0], [x16], #0x2\n"
- "st1 { v17.h }[0], [x8], #0x2\n"
- "st1 { v10.h }[0], [x4], #0x2\n"
- "st1 { v6.h }[0], [x7], #0x2\n"
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[2], [x16], #0x1\n"
- "st1 { v17.b }[2], [x8], #0x1\n"
- "st1 { v10.b }[2], [x4], #0x1\n"
- "st1 { v6.b }[2], [x7], #0x1\n"
+ "tbz x1, #1, 122f\n"
+ "st1 { v13.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x6], #0x2\n"
+ "st1 { v8.h }[0], [x7], #0x2\n"
+ "st1 { v17.h }[0], [x16], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x6], #0x1\n"
+ "st1 { v8.b }[2], [x7], #0x1\n"
+ "st1 { v17.b }[2], [x16], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[0], [x16], #0x1\n"
- "st1 { v17.b }[0], [x8], #0x1\n"
- "st1 { v10.b }[0], [x4], #0x1\n"
- "st1 { v6.b }[0], [x7], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x6], #0x1\n"
+ "st1 { v8.b }[0], [x7], #0x1\n"
+ "st1 { v17.b }[0], [x16], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 9cebfe8f03..39001aa1fd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,583 +41,577 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v12.4s }, [x19]\n"
+ "lsr x12, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ld1r { v10.16b }, [x19]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v9.16b }, [x20]\n"
- "ld1r { v8.4s }, [x19]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
- "ld1r { v7.4s }, [x20]\n"
- "ld1r { v6.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "ld1r { v5.4s }, [x19]\n"
- "lsr x10, %x[n_channels], #0x2\n"
- "cbz x10, 6f\n"
+ "cbz x12, 6f\n"
"1:" // Channel loop
- "movi v27.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x11, #0x2\n"
- "ldr q27, [%x[bias], x19]\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
- "mov v26.16b, v27.16b\n"
- "ldr s16, [%x[params]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "mov v25.16b, v27.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, %x[n_points], #0x1\n"
- "mov v24.16b, v27.16b\n"
- "ldr s4, [x9, x11]\n"
- "mov v23.16b, v27.16b\n"
- "mov v22.16b, v27.16b\n"
- "ldr s3, [x28, x11]\n"
- "mov v21.16b, v27.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v20.16b, v27.16b\n"
- "ldr s2, [x27, x11]\n"
- "mov v19.16b, v27.16b\n"
- "usubl v16.8h, v16.8b, v9.8b\n"
- "ldr s1, [x26, x11]\n"
- "usubl v4.8h, v4.8b, v10.8b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "ldr s0, [x25, x11]\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "ldr s31, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "ldr s30, [x23, x11]\n"
- "ldr s29, [x22, x11]\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "ldr x21, [x20], #0x8\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "ldr s28, [x21, x11]\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "ldr s14, [x10, x11]\n"
+ "ldr s15, [x9, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldr s16, [x28, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x27, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x26, x11]\n"
+ "ldr s19, [x25, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr s20, [x24, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x22, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "smlal v27.4s, v4.4h, v16.4h\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, x19, #0x1\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "ldr s4, [x9, x11]\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "ldr s3, [x28, x11]\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "ldr s2, [x27, x11]\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
- "ldr s16, [%x[params]], #0x4\n"
- "usubl v4.8h, v4.8b, v10.8b\n"
- "ldr s1, [x26, x11]\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "ldr s0, [x25, x11]\n"
- "usubl v16.8h, v16.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "ldr s31, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "ldr s30, [x23, x11]\n"
- "ldr s29, [x22, x11]\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "ldr x21, [x20], #0x8\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "ldr s28, [x21, x11]\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x10, x11]\n"
+ "ldr s15, [x9, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x28, x11]\n"
+ "ldr s17, [x27, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldr s18, [x26, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x25, x11]\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x20, x20, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x24, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x22, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
- "smlal v27.4s, v4.4h, v16.4h\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
"cbz %x[rq_mul_ptr], 5f\n"
- "lsl x19, x11, #0x2\n"
- "ldr q6, [%x[rq_mul_ptr], x19]\n"
- "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 5f\n"
- "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
"5:" // Channel loop: Load quantisation parameters: Done
- "sshl v27.4s, v27.4s, v7.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
- "sshl v26.4s, v26.4s, v7.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "sshl v25.4s, v25.4s, v7.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
- "sqrdmulh v27.4s, v27.4s, v6.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "sqrdmulh v25.4s, v25.4s, v6.4s\n"
- "sshl v24.4s, v24.4s, v7.4s\n"
- "and v16.16b, v27.16b, v5.16b\n"
- "and v18.16b, v26.16b, v5.16b\n"
- "and v17.16b, v25.16b, v5.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v21.16b, v23.16b, v1.16b\n"
+ "and v20.16b, v24.16b, v1.16b\n"
+ "and v19.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "and v18.16b, v26.16b, v1.16b\n"
+ "and v17.16b, v27.16b, v1.16b\n"
+ "and v16.16b, v28.16b, v1.16b\n"
+ "and v21.16b, v29.16b, v1.16b\n"
+ "and v20.16b, v30.16b, v1.16b\n"
+ "and v19.16b, v31.16b, v1.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
- "sqrdmulh v24.4s, v24.4s, v6.4s\n"
- "srshl v27.4s, v27.4s, v5.4s\n"
- "srshl v26.4s, v26.4s, v5.4s\n"
- "srshl v25.4s, v25.4s, v5.4s\n"
- "and v16.16b, v24.16b, v5.16b\n"
- "add v27.4s, v27.4s, v8.4s\n"
- "add v26.4s, v26.4s, v8.4s\n"
- "add v25.4s, v25.4s, v8.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v27.4s, v27.4s, v12.4s\n"
- "smax v26.4s, v26.4s, v12.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v12.4s\n"
- "srshl v24.4s, v24.4s, v5.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x27, x11]\n"
- "add v24.4s, v24.4s, v8.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x26, x11]\n"
- "smax v24.4s, v24.4s, v12.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x25, x11]\n"
- "sshl v23.4s, v23.4s, v7.4s\n"
- "sshl v22.4s, v22.4s, v7.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "sqrdmulh v23.4s, v23.4s, v6.4s\n"
- "sqrdmulh v22.4s, v22.4s, v6.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sshl v21.4s, v21.4s, v7.4s\n"
- "and v17.16b, v23.16b, v5.16b\n"
- "and v16.16b, v22.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x24, x11]\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v21.16b, v5.16b\n"
- "sshl v20.4s, v20.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v7.4s\n"
- "srshl v23.4s, v23.4s, v5.4s\n"
- "srshl v22.4s, v22.4s, v5.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v6.4s\n"
- "add v23.4s, v23.4s, v8.4s\n"
- "add v22.4s, v22.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "and v17.16b, v20.16b, v5.16b\n"
- "sqrdmulh v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v5.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v19.16b, v5.16b\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v21.4s, v21.4s, v8.4s\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "smax v21.4s, v21.4s, v12.4s\n"
- "srshl v20.4s, v20.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "add v20.4s, v20.4s, v8.4s\n"
- "srshl v19.4s, v19.4s, v5.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x23, x11]\n"
- "add v19.4s, v19.4s, v8.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x22, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x21, x11]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x20, x11]\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x19, x11]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x10, LSL #2\n"
+ "cmp x11, x12, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 24f\n"
- "movi v27.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
"cbz %x[bias], 9f\n"
- "add x19, %x[bias], x11, LSL #2\n"
+ "add x20, %x[bias], x11, LSL #2\n"
"tbz %x[n_channels], #1, 7f\n"
- "ld1 { v27.d }[0], [x19], #0x8\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v27.s }[2], [x19], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"b 8f\n"
"7:" // Oddments: Load bias: Bit 1: Unset
- "tbz %x[n_channels], #0, 8f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"8:" // Oddments: Load bias: Bit 1: End
-
"9:" // Oddments: Load bias: Done
- "mov v26.16b, v27.16b\n"
- "ldr s16, [%x[params]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "mov v25.16b, v27.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr x22, [x21], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v24.16b, v27.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v23.16b, v27.16b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "mov v22.16b, v27.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
"add x28, x28, x11\n"
- "mov v21.16b, v27.16b\n"
- "ldp x23, x22, [x20], #0x10\n"
- "mov v20.16b, v27.16b\n"
"add x27, x27, x11\n"
- "mov v19.16b, v27.16b\n"
- "ldr x21, [x20], #0x8\n"
- "usubl v16.8h, v16.8b, v9.8b\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h4, [x9], #0x2\n"
- "ldr h3, [x28], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h31, [x24], #0x2\n"
- "ldr h30, [x23], #0x2\n"
- "ldr h29, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v4.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x28], #0x1\n"
- "ld1 { v2.b }[2], [x27], #0x1\n"
- "ld1 { v1.b }[2], [x26], #0x1\n"
- "ld1 { v0.b }[2], [x25], #0x1\n"
- "ld1 { v31.b }[2], [x24], #0x1\n"
- "ld1 { v30.b }[2], [x23], #0x1\n"
- "ld1 { v29.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v14.b }[2], [x10], #0x1\n"
+ "ld1 { v15.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v17.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v19.b }[2], [x25], #0x1\n"
+ "ld1 { v20.b }[2], [x24], #0x1\n"
+ "ld1 { v21.b }[2], [x23], #0x1\n"
+ "ld1 { v22.b }[2], [x22], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 11f\n"
- "ldr b4, [x9], #0x1\n"
- "ldr b3, [x28], #0x1\n"
- "ldr b2, [x27], #0x1\n"
- "ldr b1, [x26], #0x1\n"
- "ldr b0, [x25], #0x1\n"
- "ldr b31, [x24], #0x1\n"
- "ldr b30, [x23], #0x1\n"
- "ldr b29, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
+ "ldr b14, [x10], #0x1\n"
+ "ldr b15, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b19, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b21, [x23], #0x1\n"
+ "ldr b22, [x22], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
- "usubl v4.8h, v4.8b, v10.8b\n"
- "subs x19, %x[n_points], #0x1\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "smlal v27.4s, v4.4h, v16.4h\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x22, [x21], #0x8\n"
+ "add x10, x10, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
"add x9, x9, x11\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "ldp x25, x24, [x20], #0x10\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
"add x28, x28, x11\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "ldp x23, x22, [x20], #0x10\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
"add x27, x27, x11\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "ldr x21, [x20], #0x8\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
"add x26, x26, x11\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
- "ldr s16, [%x[params]], #0x4\n"
"add x25, x25, x11\n"
- "usubl v16.8h, v16.8b, v9.8b\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h4, [x9], #0x2\n"
- "ldr h3, [x28], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h31, [x24], #0x2\n"
- "ldr h30, [x23], #0x2\n"
- "ldr h29, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v4.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x28], #0x1\n"
- "ld1 { v2.b }[2], [x27], #0x1\n"
- "ld1 { v1.b }[2], [x26], #0x1\n"
- "ld1 { v0.b }[2], [x25], #0x1\n"
- "ld1 { v31.b }[2], [x24], #0x1\n"
- "ld1 { v30.b }[2], [x23], #0x1\n"
- "ld1 { v29.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v14.b }[2], [x10], #0x1\n"
+ "ld1 { v15.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v17.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v19.b }[2], [x25], #0x1\n"
+ "ld1 { v20.b }[2], [x24], #0x1\n"
+ "ld1 { v21.b }[2], [x23], #0x1\n"
+ "ld1 { v22.b }[2], [x22], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 14f\n"
- "ldr b4, [x9], #0x1\n"
- "ldr b3, [x28], #0x1\n"
- "ldr b2, [x27], #0x1\n"
- "ldr b1, [x26], #0x1\n"
- "ldr b0, [x25], #0x1\n"
- "ldr b31, [x24], #0x1\n"
- "ldr b30, [x23], #0x1\n"
- "ldr b29, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
+ "ldr b14, [x10], #0x1\n"
+ "ldr b15, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b19, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b21, [x23], #0x1\n"
+ "ldr b22, [x22], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
- "usubl v4.8h, v4.8b, v10.8b\n"
- "subs x19, x19, #0x1\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "subs x20, x20, #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"bgt 12b\n"
"15:" // Oddments: Planar tail
- "smlal v27.4s, v4.4h, v16.4h\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
"cbz %x[rq_mul_ptr], 21f\n"
- "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
- "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
- "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v6.d }[0], [x21], #0x8\n"
- "ld1 { v5.d }[0], [x20], #0x8\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
"cbz %x[rq_left_shift_ptr], 16f\n"
- "ld1 { v7.d }[0], [x19], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v6.s }[2], [x21], #0x4\n"
- "ld1 { v5.s }[2], [x20], #0x4\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
"cbz %x[rq_left_shift_ptr], 17f\n"
- "ld1 { v7.s }[2], [x19], #0x4\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
"17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
"b 20f\n"
"18:" // Oddments: Load quantisation parameters: Bit 1: Unset
- "tbz %x[n_channels], #0, 20f\n"
- "ld1 { v6.s }[0], [x21], #0x4\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
"cbz %x[rq_left_shift_ptr], 19f\n"
- "ld1 { v7.s }[0], [x19], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
"20:" // Oddments: Load quantisation parameters: Bit 1: End
"21:" // Oddments: Load quantisation parameters: Done
- "sshl v27.4s, v27.4s, v7.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v21.16b, v23.16b, v1.16b\n"
+ "and v20.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
- "sqrdmulh v27.4s, v27.4s, v6.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "sshl v26.4s, v26.4s, v7.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
"add x26, x26, x11\n"
- "sshl v25.4s, v25.4s, v7.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "sshl v24.4s, v24.4s, v7.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
+ "and v19.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
- "and v16.16b, v27.16b, v5.16b\n"
"add x24, x24, x11\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
"add x23, x23, x11\n"
- "sqrdmulh v25.4s, v25.4s, v6.4s\n"
"add x22, x22, x11\n"
- "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
"add x21, x21, x11\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
"add x20, x20, x11\n"
- "and v18.16b, v26.16b, v5.16b\n"
- "add x19, x19, x11\n"
- "and v17.16b, v25.16b, v5.16b\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "and v18.16b, v26.16b, v1.16b\n"
+ "and v17.16b, v27.16b, v1.16b\n"
+ "and v16.16b, v28.16b, v1.16b\n"
+ "and v21.16b, v29.16b, v1.16b\n"
+ "and v20.16b, v30.16b, v1.16b\n"
+ "and v19.16b, v31.16b, v1.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v24.16b, v5.16b\n"
- "srshl v27.4s, v27.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v27.4s, v27.4s, v8.4s\n"
- "srshl v26.4s, v26.4s, v5.4s\n"
- "srshl v25.4s, v25.4s, v5.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "smax v27.4s, v27.4s, v12.4s\n"
- "add v26.4s, v26.4s, v8.4s\n"
- "add v25.4s, v25.4s, v8.4s\n"
- "srshl v24.4s, v24.4s, v5.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v12.4s\n"
- "smax v25.4s, v25.4s, v12.4s\n"
- "add v24.4s, v24.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smax v24.4s, v24.4s, v12.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v24.4s, v24.4s, v11.4s\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sshl v23.4s, v23.4s, v7.4s\n"
- "sshl v22.4s, v22.4s, v7.4s\n"
- "sqrdmulh v23.4s, v23.4s, v6.4s\n"
- "sqrdmulh v22.4s, v22.4s, v6.4s\n"
- "sshl v21.4s, v21.4s, v7.4s\n"
- "sshl v20.4s, v20.4s, v7.4s\n"
- "and v17.16b, v23.16b, v5.16b\n"
- "and v16.16b, v22.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v21.16b, v5.16b\n"
- "and v17.16b, v20.16b, v5.16b\n"
- "srshl v23.4s, v23.4s, v5.4s\n"
- "srshl v22.4s, v22.4s, v5.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "add v23.4s, v23.4s, v8.4s\n"
- "add v22.4s, v22.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "smax v23.4s, v23.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v5.4s\n"
- "srshl v20.4s, v20.4s, v5.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "add v21.4s, v21.4s, v8.4s\n"
- "add v20.4s, v20.4s, v8.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v21.4s, v21.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "sshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "sqrdmulh v19.4s, v19.4s, v6.4s\n"
- "and v16.16b, v19.16b, v5.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v5.4s\n"
- "add v19.4s, v19.4s, v8.4s\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_channels], #1, 22f\n"
- "st1 { v27.h }[0], [x27], #0x2\n"
- "st1 { v26.h }[0], [x26], #0x2\n"
- "st1 { v25.h }[0], [x25], #0x2\n"
- "st1 { v24.h }[0], [x24], #0x2\n"
- "st1 { v23.h }[0], [x23], #0x2\n"
- "st1 { v22.h }[0], [x22], #0x2\n"
- "st1 { v21.h }[0], [x21], #0x2\n"
- "st1 { v20.h }[0], [x20], #0x2\n"
- "st1 { v19.h }[0], [x19], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "st1 { v27.b }[2], [x27], #0x1\n"
- "st1 { v26.b }[2], [x26], #0x1\n"
- "st1 { v25.b }[2], [x25], #0x1\n"
- "st1 { v24.b }[2], [x24], #0x1\n"
- "st1 { v23.b }[2], [x23], #0x1\n"
- "st1 { v22.b }[2], [x22], #0x1\n"
- "st1 { v21.b }[2], [x21], #0x1\n"
- "st1 { v20.b }[2], [x20], #0x1\n"
- "st1 { v19.b }[2], [x19], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: Store: Bit 1: Unset
- "tbz %x[n_channels], #0, 23f\n"
- "st1 { v27.b }[0], [x27], #0x1\n"
- "st1 { v26.b }[0], [x26], #0x1\n"
- "st1 { v25.b }[0], [x25], #0x1\n"
- "st1 { v24.b }[0], [x24], #0x1\n"
- "st1 { v23.b }[0], [x23], #0x1\n"
- "st1 { v22.b }[0], [x22], #0x1\n"
- "st1 { v21.b }[0], [x21], #0x1\n"
- "st1 { v20.b }[0], [x20], #0x1\n"
- "st1 { v19.b }[0], [x19], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
"24:" // End
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 057b1ef492..a6dba90f9e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,487 +40,475 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "movi v5.16b, #0x1\n"
- "ldr x22, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "ushr v5.4s, v5.4s, #0x8\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v15.16b, #0x1\n"
+ "ushr v15.4s, v15.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
"movi v26.4s, #0x0\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
- "mov x11, #0x0\n"
- "movi v1.4s, #0x0\n"
- "ld1 { v15.16b }, [x22]\n"
- "mov x10, #0x0\n"
- "movi v22.4s, #0x0\n"
- "ld1 { v29.16b }, [x20]\n"
- "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "movi v25.4s, #0x0\n"
- "ld1 { v0.16b }, [x19]\n"
- "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
- "movi v13.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v29.16b, v1.16b\n"
+ "mov v16.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v22.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v30.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v23.16b, v4.16b\n"
+ "mov v21.16b, v4.16b\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "mov v20.16b, v15.16b\n"
- "ldr x19, [%x[inptrs], #0x20]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v20.16b, v4.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x2\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x2\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x4\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "mov v25.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
"cmp %x[n_channels], #0x4\n"
- "ext v20.16b, v20.16b, v20.16b, #0x2\n"
- "ld1r { v4.4s }, [x21]\n"
- "mov v17.16b, v15.16b\n"
- "ld1 { v2.16b }, [x20]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "ld1 { v7.16b }, [x19]\n"
- "mov v23.16b, v15.16b\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "ext v23.16b, v23.16b, v23.16b, #0x6\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "mov v18.16b, v29.16b\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
- "zip1 v15.4s, v15.4s, v17.4s\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "ext v18.16b, v18.16b, v18.16b, #0x2\n"
- "ld1r { v14.4s }, [x9]\n"
- "zip1 v20.4s, v20.4s, v23.4s\n"
- "ld1r { v27.4s }, [x28]\n"
- "zip1 v15.4s, v15.4s, v20.4s\n"
- "ld1r { v23.4s }, [x27]\n"
- "mov v17.16b, v29.16b\n"
- "ldr q6, [%x[params], #0x0]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "ldr q8, [%x[params], #0x10]\n"
- "mov v11.16b, v29.16b\n"
- "ldr q9, [%x[params], #0x20]\n"
- "ext v11.16b, v11.16b, v11.16b, #0x6\n"
- "ldr q10, [%x[params], #0x30]\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v24.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x2\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "zip1 v29.4s, v29.4s, v17.4s\n"
- "mov v12.16b, v0.16b\n"
- "ext v12.16b, v12.16b, v12.16b, #0x2\n"
- "zip1 v18.4s, v18.4s, v11.4s\n"
- "zip1 v29.4s, v29.4s, v18.4s\n"
- "mov v17.16b, v0.16b\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "mov v11.16b, v0.16b\n"
- "ext v11.16b, v11.16b, v11.16b, #0x6\n"
- "mov v18.16b, v2.16b\n"
- "zip1 v0.4s, v0.4s, v17.4s\n"
- "ext v18.16b, v18.16b, v18.16b, #0x2\n"
- "zip1 v12.4s, v12.4s, v11.4s\n"
- "zip1 v0.4s, v0.4s, v12.4s\n"
- "mov v17.16b, v2.16b\n"
+ "zip1 v1.4s, v1.4s, v16.4s\n"
+ "mov v16.16b, v3.16b\n"
+ "zip1 v29.4s, v29.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v31.4s\n"
+ "zip1 v22.4s, v22.4s, v30.4s\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x2\n"
"ext v17.16b, v17.16b, v17.16b, #0x4\n"
- "mov v19.16b, v2.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x6\n"
- "mov v28.16b, v7.16b\n"
- "zip1 v2.4s, v2.4s, v17.4s\n"
- "ext v28.16b, v28.16b, v28.16b, #0x2\n"
- "zip1 v18.4s, v18.4s, v19.4s\n"
- "zip1 v2.4s, v2.4s, v18.4s\n"
- "mov v18.16b, v7.16b\n"
- "ext v18.16b, v18.16b, v18.16b, #0x4\n"
- "mov v21.16b, v7.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x6\n"
- "movi v30.4s, #0x0\n"
- "zip1 v7.4s, v7.4s, v18.4s\n"
- "movi v3.4s, #0x0\n"
- "zip1 v28.4s, v28.4s, v21.4s\n"
- "zip1 v7.4s, v7.4s, v28.4s\n"
- "movi v12.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v21.4s\n"
+ "zip1 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v25.4s, v25.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v29.4s\n"
+ "zip1 v2.4s, v2.4s, v22.4s\n"
+ ".inst 0x6f81e1fa // udot v26.4s, v15.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v24.4s, v24.4s, v16.4s\n"
+ ".inst 0x6fa1e1fb // udot v27.4s, v15.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v23.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6f81e9f7 // udot v23.4s, v15.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
"movi v21.4s, #0x0\n"
+ ".inst 0x6fa1e9f6 // udot v22.4s, v15.16b, v1.4b[3]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x6f82e1f5 // udot v21.4s, v15.16b, v2.4b[0]\n"
+ "movi v8.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6fa2e1f4 // udot v20.4s, v15.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
"movi v17.4s, #0x0\n"
+ ".inst 0x6f82e9e9 // udot v9.4s, v15.16b, v2.4b[2]\n"
"movi v16.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v25.4s\n"
+ ".inst 0x6fa2e9e8 // udot v8.4s, v15.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v24.4s\n"
+ ".inst 0x6f84e1f3 // udot v19.4s, v15.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e1f2 // udot v18.4s, v15.16b, v4.4b[1]\n"
+ ".inst 0x6f84e9f1 // udot v17.4s, v15.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e9f0 // udot v16.4s, v15.16b, v4.4b[3]\n"
"movi v31.4s, #0x0\n"
- ".inst 0x6f8fe0ba // udot v26.4s, v5.16b, v15.4b[0]\n"
- ".inst 0x6fafe0a1 // udot v1.4s, v5.16b, v15.4b[1]\n"
- ".inst 0x6f8fe8b6 // udot v22.4s, v5.16b, v15.4b[2]\n"
- ".inst 0x6fafe8b9 // udot v25.4s, v5.16b, v15.4b[3]\n"
- ".inst 0x6f9de0ad // udot v13.4s, v5.16b, v29.4b[0]\n"
- ".inst 0x6fbde0be // udot v30.4s, v5.16b, v29.4b[1]\n"
- ".inst 0x6f9de8a3 // udot v3.4s, v5.16b, v29.4b[2]\n"
- ".inst 0x6fbde8ac // udot v12.4s, v5.16b, v29.4b[3]\n"
- ".inst 0x6f80e0ab // udot v11.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0b3 // udot v19.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6f80e8b5 // udot v21.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6f82e0b0 // udot v16.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6fa2e0bc // udot v28.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x6f82e8b2 // udot v18.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8b4 // udot v20.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x6f87e0b8 // udot v24.4s, v5.16b, v7.4b[0]\n"
- ".inst 0x6fa7e0bf // udot v31.4s, v5.16b, v7.4b[1]\n"
- "mov v26.16b, v26.16b\n"
- "mov v1.16b, v1.16b\n"
- "mov v22.16b, v22.16b\n"
- "mov v25.16b, v25.16b\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "movi v13.4s, #0x0\n"
- ".inst 0x6f87e8ad // udot v13.4s, v5.16b, v7.4b[2]\n"
- "add v1.4s, v1.4s, v30.4s\n"
"movi v30.4s, #0x0\n"
- ".inst 0x6fa7e8be // udot v30.4s, v5.16b, v7.4b[3]\n"
- "add v22.4s, v22.4s, v3.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v11.4s\n"
- "add v1.4s, v1.4s, v19.4s\n"
- "add v22.4s, v22.4s, v21.4s\n"
- "add v25.4s, v25.4s, v17.4s\n"
- "mov v11.16b, v11.16b\n"
- "mov v3.16b, v19.16b\n"
- "mov v19.16b, v21.16b\n"
- "mov v21.16b, v17.16b\n"
- "add v11.4s, v11.4s, v16.4s\n"
- "add v3.4s, v3.4s, v28.4s\n"
- "add v19.4s, v19.4s, v18.4s\n"
- "add v21.4s, v21.4s, v20.4s\n"
- "add v11.4s, v11.4s, v24.4s\n"
- "add v3.4s, v3.4s, v31.4s\n"
- "add v19.4s, v19.4s, v13.4s\n"
- "add v21.4s, v21.4s, v30.4s\n"
- "neg v4.4s, v4.4s\n"
- "mul v26.4s, v26.4s, v4.4s\n"
- "str q26, [SP, #0x0]\n"
- "mul v1.4s, v1.4s, v4.4s\n"
- "mul v22.4s, v22.4s, v4.4s\n"
- "str q1, [SP, #0x10]\n"
- "mul v25.4s, v25.4s, v4.4s\n"
- "mul v11.4s, v11.4s, v4.4s\n"
- "str q22, [SP, #0x20]\n"
- "mul v3.4s, v3.4s, v4.4s\n"
- "str q25, [SP, #0x30]\n"
- "mul v19.4s, v19.4s, v4.4s\n"
- "mul v21.4s, v21.4s, v4.4s\n"
- "str q11, [SP, #0x40]\n"
- "add v26.4s, v26.4s, v6.4s\n"
- "str q3, [SP, #0x50]\n"
- "add v1.4s, v1.4s, v6.4s\n"
- "str q19, [SP, #0x60]\n"
- "add v22.4s, v22.4s, v6.4s\n"
- "add v25.4s, v25.4s, v6.4s\n"
- "str q21, [SP, #0x70]\n"
- "add v11.4s, v11.4s, v6.4s\n"
- "add v3.4s, v3.4s, v6.4s\n"
- "add v19.4s, v19.4s, v6.4s\n"
- "add v21.4s, v21.4s, v6.4s\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x6f80e1ff // udot v31.4s, v15.16b, v0.4b[0]\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6fa0e1fe // udot v30.4s, v15.16b, v0.4b[1]\n"
+ ".inst 0x6f80e9fd // udot v29.4s, v15.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e9fc // udot v28.4s, v15.16b, v0.4b[3]\n"
+ "add v24.4s, v26.4s, v21.4s\n"
+ "add v25.4s, v27.4s, v20.4s\n"
+ "add v26.4s, v23.4s, v9.4s\n"
+ "add v27.4s, v22.4s, v8.4s\n"
+ "add v23.4s, v19.4s, v21.4s\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6f83e1f6 // udot v22.4s, v15.16b, v3.4b[0]\n"
+ "add v21.4s, v18.4s, v20.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6fa3e1f4 // udot v20.4s, v15.16b, v3.4b[1]\n"
+ "add v19.4s, v17.4s, v9.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6f83e9f2 // udot v18.4s, v15.16b, v3.4b[2]\n"
+ "add v17.4s, v16.4s, v8.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x6fa3e9f0 // udot v16.4s, v15.16b, v3.4b[3]\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "add v28.4s, v23.4s, v22.4s\n"
+ "add v29.4s, v21.4s, v20.4s\n"
+ "add v30.4s, v19.4s, v18.4s\n"
+ "add v31.4s, v17.4s, v16.4s\n"
+ "neg v13.4s, v13.4s\n"
+ "mul v24.4s, v24.4s, v13.4s\n"
+ "mul v25.4s, v25.4s, v13.4s\n"
+ "mul v26.4s, v26.4s, v13.4s\n"
+ "mul v27.4s, v27.4s, v13.4s\n"
+ "mul v28.4s, v28.4s, v13.4s\n"
+ "mul v29.4s, v29.4s, v13.4s\n"
+ "mul v30.4s, v30.4s, v13.4s\n"
+ "mul v31.4s, v31.4s, v13.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"ble 2f\n"
"1:" // Loop
- ".inst 0x6f8fe11a // udot v26.4s, v8.16b, v15.4b[0]\n"
- "ldr q20, [%x[params], #0x0]\n"
- "add x11, x11, #0x10\n"
- ".inst 0x6fafe101 // udot v1.4s, v8.16b, v15.4b[1]\n"
- "ldr q4, [%x[params], #0x10]\n"
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q14, [%x[params], #0x20]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x6f8fe916 // udot v22.4s, v8.16b, v15.4b[2]\n"
- "ldr q6, [%x[params], #0x20]\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x6fafe919 // udot v25.4s, v8.16b, v15.4b[3]\n"
- ".inst 0x6f80e10b // udot v11.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6fa0e103 // udot v3.4s, v8.16b, v0.4b[1]\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6fa0e915 // udot v21.4s, v8.16b, v0.4b[3]\n"
- "ldr q8, [%x[params], #0x30]\n"
- ".inst 0x6f9de13a // udot v26.4s, v9.16b, v29.4b[0]\n"
- ".inst 0x6fbde121 // udot v1.4s, v9.16b, v29.4b[1]\n"
- ".inst 0x6f9de936 // udot v22.4s, v9.16b, v29.4b[2]\n"
- ".inst 0x6fbde939 // udot v25.4s, v9.16b, v29.4b[3]\n"
- ".inst 0x6f82e12b // udot v11.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6fa2e123 // udot v3.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6f82e933 // udot v19.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6fa2e935 // udot v21.4s, v9.16b, v2.4b[3]\n"
- "ldr q9, [%x[params], #0x40]\n"
- ".inst 0x6f80e15a // udot v26.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6fa0e141 // udot v1.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x6f80e956 // udot v22.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6fa0e959 // udot v25.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6f87e14b // udot v11.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x6fa7e143 // udot v3.4s, v10.16b, v7.4b[1]\n"
- ".inst 0x6f87e953 // udot v19.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6fa7e955 // udot v21.4s, v10.16b, v7.4b[3]\n"
- "ldr q10, [%x[params], #0x50]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
"add %x[params], %x[params], #0x60\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "and v30.16b, v26.16b, v4.16b\n"
- "and v17.16b, v1.16b, v4.16b\n"
- "and v16.16b, v22.16b, v4.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v30.4s\n"
- "sqadd v1.4s, v1.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v25.16b, v4.16b\n"
- "srshl v26.4s, v26.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v4.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v1.4s, v1.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v23.4s\n"
- "smin v1.4s, v1.4s, v23.4s\n"
- "smin v22.4s, v22.4s, v23.4s\n"
- "smax v26.4s, v26.4s, v27.4s\n"
- "smax v1.4s, v1.4s, v27.4s\n"
- "smax v22.4s, v22.4s, v27.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x26, x10]\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q26, [SP, #0x0]\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "str s1, [x25, x10]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "ldr q1, [SP, #0x10]\n"
- "and v16.16b, v11.16b, v4.16b\n"
- "str s22, [x24, x10]\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "ldr q22, [SP, #0x20]\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v20.4s\n"
- "and v17.16b, v3.16b, v4.16b\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v23.4s\n"
- "and v16.16b, v19.16b, v4.16b\n"
- "srshl v11.4s, v11.4s, v4.4s\n"
- "smax v25.4s, v25.4s, v27.4s\n"
- "sqadd v3.4s, v3.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "smin v24.4s, v24.4s, v10.4s\n"
+ "smin v25.4s, v25.4s, v10.4s\n"
+ "smin v26.4s, v26.4s, v10.4s\n"
+ "smin v27.4s, v27.4s, v10.4s\n"
+ "smin v28.4s, v28.4s, v10.4s\n"
+ "smin v29.4s, v29.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v10.4s\n"
+ "smin v31.4s, v31.4s, v10.4s\n"
+ "smax v24.4s, v24.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v11.4s\n"
+ "smax v27.4s, v27.4s, v11.4s\n"
+ "smax v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v11.4s\n"
+ "smax v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v11.4s, v11.4s, v14.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x23, x10]\n"
- "smin v11.4s, v11.4s, v23.4s\n"
- "srshl v3.4s, v3.4s, v4.4s\n"
- "ldr q25, [SP, #0x30]\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v11.4s, v11.4s, v27.4s\n"
- "add v3.4s, v3.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v4.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "smin v3.4s, v3.4s, v23.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str s11, [x22, x10]\n"
- "smax v3.4s, v3.4s, v27.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "ldr q11, [SP, #0x40]\n"
- "and v16.16b, v21.16b, v4.16b\n"
- "add v26.4s, v26.4s, v6.4s\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "smin v19.4s, v19.4s, v23.4s\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "str s3, [x21, x10]\n"
- "smax v19.4s, v19.4s, v27.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr q3, [SP, #0x50]\n"
- "add v1.4s, v1.4s, v6.4s\n"
- "add v22.4s, v22.4s, v6.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x20, x10]\n"
- "add v25.4s, v25.4s, v6.4s\n"
- "add v11.4s, v11.4s, v6.4s\n"
- "ldr q19, [SP, #0x60]\n"
- "srshl v21.4s, v21.4s, v4.4s\n"
- "add v3.4s, v3.4s, v6.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v19.4s, v19.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v23.4s\n"
- "smax v21.4s, v21.4s, v27.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x19, x10]\n"
- "add x10, x10, #0x4\n"
- "ldr q21, [SP, #0x70]\n"
- "add v21.4s, v21.4s, v6.4s\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"bgt 1b\n"
"2:" // Tail
- ".inst 0x6f8fe11a // udot v26.4s, v8.16b, v15.4b[0]\n"
- "ldr q20, [%x[params], #0x0]\n"
- "add x26, x26, x10\n"
- ".inst 0x6fafe101 // udot v1.4s, v8.16b, v15.4b[1]\n"
- "ldr q4, [%x[params], #0x10]\n"
- "add x25, x25, x10\n"
- ".inst 0x6f8fe916 // udot v22.4s, v8.16b, v15.4b[2]\n"
- "add x24, x24, x10\n"
- ".inst 0x6fafe919 // udot v25.4s, v8.16b, v15.4b[3]\n"
- "add x23, x23, x10\n"
- ".inst 0x6f80e10b // udot v11.4s, v8.16b, v0.4b[0]\n"
- "add x22, x22, x10\n"
- ".inst 0x6fa0e103 // udot v3.4s, v8.16b, v0.4b[1]\n"
- "add x21, x21, x10\n"
- ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
- "add x20, x20, x10\n"
- ".inst 0x6fa0e915 // udot v21.4s, v8.16b, v0.4b[3]\n"
- "add x19, x19, x10\n"
- ".inst 0x6f9de13a // udot v26.4s, v9.16b, v29.4b[0]\n"
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x6fbde121 // udot v1.4s, v9.16b, v29.4b[1]\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
"add %x[params], %x[params], #0x20\n"
- ".inst 0x6f9de936 // udot v22.4s, v9.16b, v29.4b[2]\n"
- ".inst 0x6fbde939 // udot v25.4s, v9.16b, v29.4b[3]\n"
- ".inst 0x6f82e12b // udot v11.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6fa2e123 // udot v3.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6f82e933 // udot v19.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6fa2e935 // udot v21.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6f80e15a // udot v26.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6fa0e141 // udot v1.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x6f80e956 // udot v22.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6fa0e959 // udot v25.4s, v10.16b, v0.4b[3]\n"
- ".inst 0x6f87e14b // udot v11.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x6fa7e143 // udot v3.4s, v10.16b, v7.4b[1]\n"
- ".inst 0x6f87e953 // udot v19.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6fa7e955 // udot v21.4s, v10.16b, v7.4b[3]\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "and v30.16b, v26.16b, v4.16b\n"
- "and v17.16b, v1.16b, v4.16b\n"
- "and v16.16b, v22.16b, v4.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v30.4s\n"
- "sqadd v1.4s, v1.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v25.16b, v4.16b\n"
- "srshl v26.4s, v26.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v4.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v1.4s, v1.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v23.4s\n"
- "smin v1.4s, v1.4s, v23.4s\n"
- "smin v22.4s, v22.4s, v23.4s\n"
- "smax v26.4s, v26.4s, v27.4s\n"
- "smax v1.4s, v1.4s, v27.4s\n"
- "smax v22.4s, v22.4s, v27.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v1.16b, v1.16b, v1.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v20.4s\n"
- "and v16.16b, v11.16b, v4.16b\n"
- "and v17.16b, v3.16b, v4.16b\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v23.4s\n"
- "sqadd v11.4s, v11.4s, v16.4s\n"
- "sqadd v3.4s, v3.4s, v17.4s\n"
- "smax v25.4s, v25.4s, v27.4s\n"
- "and v16.16b, v19.16b, v4.16b\n"
- "srshl v11.4s, v11.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "smin v24.4s, v24.4s, v10.4s\n"
+ "smin v25.4s, v25.4s, v10.4s\n"
+ "smin v26.4s, v26.4s, v10.4s\n"
+ "smin v27.4s, v27.4s, v10.4s\n"
+ "smin v28.4s, v28.4s, v10.4s\n"
+ "smin v29.4s, v29.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v10.4s\n"
+ "smin v31.4s, v31.4s, v10.4s\n"
+ "smax v24.4s, v24.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v11.4s\n"
+ "smax v27.4s, v27.4s, v11.4s\n"
+ "smax v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v11.4s\n"
+ "smax v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "srshl v3.4s, v3.4s, v4.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v11.4s, v11.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v3.4s, v3.4s, v14.4s\n"
- "smin v11.4s, v11.4s, v23.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v3.4s, v3.4s, v23.4s\n"
- "smax v11.4s, v11.4s, v27.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v3.4s, v3.4s, v27.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "srshl v19.4s, v19.4s, v4.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "and v16.16b, v21.16b, v4.16b\n"
- "uzp1 v3.16b, v3.16b, v3.16b\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v19.4s, v19.4s, v23.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v19.4s, v19.4s, v27.4s\n"
- "srshl v21.4s, v21.4s, v4.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "smin v21.4s, v21.4s, v23.4s\n"
- "smax v21.4s, v21.4s, v27.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"blt 3f\n"
- "str s26, [x26, #0x0]\n"
- "str s1, [x25, #0x0]\n"
- "str s22, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s11, [x22, #0x0]\n"
- "str s3, [x21, #0x0]\n"
- "str s19, [x20, #0x0]\n"
- "str s21, [x19, #0x0]\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 4f\n"
"3:" // Tail: Oddments
- "st1 { v26.b }[0], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[0], [x25], #0x1\n"
- "st1 { v22.b }[0], [x24], #0x1\n"
- "st1 { v25.b }[0], [x23], #0x1\n"
- "st1 { v11.b }[0], [x22], #0x1\n"
- "st1 { v3.b }[0], [x21], #0x1\n"
- "st1 { v19.b }[0], [x20], #0x1\n"
- "st1 { v21.b }[0], [x19], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v26.b }[1], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[1], [x25], #0x1\n"
- "st1 { v22.b }[1], [x24], #0x1\n"
- "st1 { v25.b }[1], [x23], #0x1\n"
- "st1 { v11.b }[1], [x22], #0x1\n"
- "st1 { v3.b }[1], [x21], #0x1\n"
- "st1 { v19.b }[1], [x20], #0x1\n"
- "st1 { v21.b }[1], [x19], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v26.b }[2], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[2], [x25], #0x1\n"
- "st1 { v22.b }[2], [x24], #0x1\n"
- "st1 { v25.b }[2], [x23], #0x1\n"
- "st1 { v11.b }[2], [x22], #0x1\n"
- "st1 { v3.b }[2], [x21], #0x1\n"
- "st1 { v19.b }[2], [x20], #0x1\n"
- "st1 { v21.b }[2], [x19], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v26.b }[3], [x26], #0x1\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v1.b }[3], [x25], #0x1\n"
- "st1 { v22.b }[3], [x24], #0x1\n"
- "st1 { v25.b }[3], [x23], #0x1\n"
- "st1 { v11.b }[3], [x22], #0x1\n"
- "st1 { v3.b }[3], [x21], #0x1\n"
- "st1 { v19.b }[3], [x20], #0x1\n"
- "st1 { v21.b }[3], [x19], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
"4:" // Tail: End
- "add SP, SP, #0x80\n"
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 40242e9718..027cc9e5a2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,622 +40,596 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "movi v15.16b, #0x1\n"
- "ldr x21, [%x[inptrs], #0x0]\n"
- "add SP, SP, #-0x80\n"
- "movi v14.4s, #0x1\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "movi v28.4s, #0x0\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
- "mov x11, #0x0\n"
- "movi v27.4s, #0x0\n"
- "ld1 { v13.16b }, [x21]\n"
- "mov x10, #0x0\n"
- "movi v26.4s, #0x0\n"
- "ld1 { v12.16b }, [x20]\n"
- "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "movi v25.4s, #0x0\n"
- "ld1 { v7.16b }, [x19]\n"
- "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "movi v28.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "movi v31.4s, #0x0\n"
"movi v24.4s, #0x0\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
- "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "mov v18.16b, v13.16b\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "movi v30.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
"ldr x20, [%x[inptrs], #0x20]\n"
- "cmp %x[n_channels], #0x4\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "ldr x19, [%x[inptrs], #0x28]\n"
- "mov v17.16b, v12.16b\n"
- "ld1 { v6.16b }, [x21]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "ld1 { v5.16b }, [x20]\n"
- "mov v16.16b, v7.16b\n"
- "ld1 { v4.16b }, [x19]\n"
+ "mov v16.16b, v3.16b\n"
"ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "ldr x20, [%x[inptrs], #0x30]\n"
- "zip1 v13.2d, v13.2d, v18.2d\n"
- "ldr x19, [%x[inptrs], #0x38]\n"
- "zip1 v12.2d, v12.2d, v17.2d\n"
- "ld1r { v3.4s }, [x22]\n"
- "mov v18.16b, v6.16b\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov v15.16b, v4.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
"ld1 { v2.16b }, [x20]\n"
- "zip1 v7.2d, v7.2d, v16.2d\n"
- "ld1 { v1.16b }, [x19]\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "mov v17.16b, v5.16b\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
- "mov v16.16b, v4.16b\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "zip1 v6.2d, v6.2d, v18.2d\n"
- "ld1r { v0.4s }, [x9]\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "ld1r { v31.4s }, [x28]\n"
- "zip1 v5.2d, v5.2d, v17.2d\n"
- "ld1r { v30.4s }, [x27]\n"
- "mov v17.16b, v2.16b\n"
- "ldr q29, [%x[params], #0x0]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "mov v20.16b, v2.16b\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x1\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "zip1 v3.2d, v3.2d, v16.2d\n"
+ "zip1 v4.2d, v4.2d, v15.2d\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "mov v26.16b, v1.16b\n"
+ "mov v13.16b, v5.16b\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x38]\n"
+ "mov v19.16b, v6.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v7.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "mov v17.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v20.2d\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6f83e392 // udot v18.4s, v28.16b, v3.4b[0]\n"
"ext v17.16b, v17.16b, v17.16b, #0x1\n"
- "ldr q8, [%x[params], #0x10]\n"
- "zip1 v4.2d, v4.2d, v16.2d\n"
- "ldr q9, [%x[params], #0x20]\n"
- "mov v16.16b, v1.16b\n"
- "ldr q10, [%x[params], #0x30]\n"
+ ".inst 0x6f83eb9f // udot v31.4s, v28.16b, v3.4b[2]\n"
+ ".inst 0x6f84e398 // udot v24.4s, v28.16b, v4.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ ".inst 0x6f84eb9e // udot v30.4s, v28.16b, v4.4b[2]\n"
+ "mov v16.16b, v0.16b\n"
+ ".inst 0x6f82e395 // udot v21.4s, v28.16b, v2.4b[0]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v29.4s, #0x1\n"
+ ".inst 0x6f82eb94 // udot v20.4s, v28.16b, v2.4b[2]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
"ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "ldr q11, [%x[params], #0x40]\n"
- "add %x[params], %x[params], #0x50\n"
- "zip1 v2.2d, v2.2d, v17.2d\n"
- "movi v23.4s, #0x0\n"
+ "zip1 v1.2d, v1.2d, v26.2d\n"
+ ".inst 0x6fa3e3b2 // udot v18.4s, v29.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v13.2d\n"
+ "zip1 v6.2d, v6.2d, v19.2d\n"
+ ".inst 0x6fa3ebbf // udot v31.4s, v29.16b, v3.4b[3]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v17.2d\n"
"movi v22.4s, #0x0\n"
- "zip1 v1.2d, v1.2d, v16.2d\n"
- "movi v21.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
+ ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x6fa4ebbe // udot v30.4s, v29.16b, v4.4b[3]\n"
+ ".inst 0x6f81e396 // udot v22.4s, v28.16b, v1.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "movi v25.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ ".inst 0x6f81eb9a // udot v26.4s, v28.16b, v1.4b[2]\n"
+ "zip1 v0.2d, v0.2d, v16.2d\n"
"movi v19.4s, #0x0\n"
- ".inst 0x6f8de1fc // udot v28.4s, v15.16b, v13.4b[0]\n"
- ".inst 0x6f8de9fb // udot v27.4s, v15.16b, v13.4b[2]\n"
- ".inst 0x6f8ce1fa // udot v26.4s, v15.16b, v12.4b[0]\n"
- ".inst 0x6f8ce9f9 // udot v25.4s, v15.16b, v12.4b[2]\n"
- ".inst 0x6fade1dc // udot v28.4s, v14.16b, v13.4b[1]\n"
- ".inst 0x6fade9db // udot v27.4s, v14.16b, v13.4b[3]\n"
- ".inst 0x6face1da // udot v26.4s, v14.16b, v12.4b[1]\n"
- ".inst 0x6face9d9 // udot v25.4s, v14.16b, v12.4b[3]\n"
- ".inst 0x6f87e1f8 // udot v24.4s, v15.16b, v7.4b[0]\n"
- ".inst 0x6f87e9f7 // udot v23.4s, v15.16b, v7.4b[2]\n"
- ".inst 0x6f86e1f6 // udot v22.4s, v15.16b, v6.4b[0]\n"
- ".inst 0x6f86e9f5 // udot v21.4s, v15.16b, v6.4b[2]\n"
- ".inst 0x6fa7e1d8 // udot v24.4s, v14.16b, v7.4b[1]\n"
- ".inst 0x6fa7e9d7 // udot v23.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x6fa6e1d6 // udot v22.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x6fa6e9d5 // udot v21.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6f85e1f2 // udot v18.4s, v15.16b, v5.4b[0]\n"
- ".inst 0x6f85e9f1 // udot v17.4s, v15.16b, v5.4b[2]\n"
- ".inst 0x6f84e1f0 // udot v16.4s, v15.16b, v4.4b[0]\n"
- ".inst 0x6f84e9f4 // udot v20.4s, v15.16b, v4.4b[2]\n"
- ".inst 0x6fa5e1d2 // udot v18.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa5e9d1 // udot v17.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa4e1d0 // udot v16.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa4e9d4 // udot v20.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6f82e1f3 // udot v19.4s, v15.16b, v2.4b[0]\n"
- "mov v28.16b, v28.16b\n"
- "mov v27.16b, v27.16b\n"
- "add v28.4s, v28.4s, v26.4s\n"
- ".inst 0x6fa2e1d3 // udot v19.4s, v14.16b, v2.4b[1]\n"
- "add v27.4s, v27.4s, v25.4s\n"
- "add v28.4s, v28.4s, v24.4s\n"
- "mov v26.16b, v26.16b\n"
- "add v27.4s, v27.4s, v23.4s\n"
- "add v28.4s, v28.4s, v22.4s\n"
- "mov v25.16b, v25.16b\n"
- "add v27.4s, v27.4s, v21.4s\n"
- "add v28.4s, v28.4s, v18.4s\n"
- "add v26.4s, v26.4s, v24.4s\n"
- "add v27.4s, v27.4s, v17.4s\n"
- "add v25.4s, v25.4s, v23.4s\n"
- "add v26.4s, v26.4s, v22.4s\n"
- "mov v24.16b, v24.16b\n"
- "add v25.4s, v25.4s, v21.4s\n"
- "add v26.4s, v26.4s, v18.4s\n"
- "mov v23.16b, v23.16b\n"
- "add v25.4s, v25.4s, v17.4s\n"
- "add v26.4s, v26.4s, v16.4s\n"
- "add v24.4s, v24.4s, v22.4s\n"
- "add v25.4s, v25.4s, v20.4s\n"
- "add v23.4s, v23.4s, v21.4s\n"
- "add v24.4s, v24.4s, v18.4s\n"
- "mov v22.16b, v22.16b\n"
- "add v23.4s, v23.4s, v17.4s\n"
- "add v24.4s, v24.4s, v16.4s\n"
- "mov v21.16b, v21.16b\n"
- "add v23.4s, v23.4s, v20.4s\n"
- "add v24.4s, v24.4s, v19.4s\n"
- "add v22.4s, v22.4s, v18.4s\n"
+ ".inst 0x6f85e399 // udot v25.4s, v28.16b, v5.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x6f85eb9b // udot v27.4s, v28.16b, v5.4b[2]\n"
+ ".inst 0x6f86e393 // udot v19.4s, v28.16b, v6.4b[0]\n"
+ "add v24.4s, v18.4s, v24.4s\n"
+ "mov x9, #0x0\n"
"movi v18.4s, #0x0\n"
- ".inst 0x6f82e9f2 // udot v18.4s, v15.16b, v2.4b[2]\n"
- "add v21.4s, v21.4s, v17.4s\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6f81e1f1 // udot v17.4s, v15.16b, v1.4b[0]\n"
- ".inst 0x6fa2e9d2 // udot v18.4s, v14.16b, v2.4b[3]\n"
- "add v22.4s, v22.4s, v16.4s\n"
+ ".inst 0x6f86eb92 // udot v18.4s, v28.16b, v6.4b[2]\n"
+ ".inst 0x6fa2e3b5 // udot v21.4s, v29.16b, v2.4b[1]\n"
+ "mov x28, #0x0\n"
+ ".inst 0x6fa2ebb4 // udot v20.4s, v29.16b, v2.4b[3]\n"
+ "add v17.4s, v31.4s, v30.4s\n"
+ ".inst 0x6fa1e3b6 // udot v22.4s, v29.16b, v1.4b[1]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
"movi v16.4s, #0x0\n"
- ".inst 0x6fa1e1d1 // udot v17.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6f81e9f0 // udot v16.4s, v15.16b, v1.4b[2]\n"
- "add v23.4s, v23.4s, v18.4s\n"
- "add v21.4s, v21.4s, v20.4s\n"
- "add v22.4s, v22.4s, v19.4s\n"
- ".inst 0x6fa1e9d0 // udot v16.4s, v14.16b, v1.4b[3]\n"
- "add v21.4s, v21.4s, v18.4s\n"
- "add v22.4s, v22.4s, v17.4s\n"
- "neg v3.4s, v3.4s\n"
- "add v21.4s, v21.4s, v16.4s\n"
- "mul v28.4s, v28.4s, v3.4s\n"
- "str q28, [SP, #0x0]\n"
- "mul v27.4s, v27.4s, v3.4s\n"
- "mul v26.4s, v26.4s, v3.4s\n"
- "str q27, [SP, #0x10]\n"
- "mul v25.4s, v25.4s, v3.4s\n"
- "mul v24.4s, v24.4s, v3.4s\n"
- "str q26, [SP, #0x20]\n"
- "mul v23.4s, v23.4s, v3.4s\n"
- "str q25, [SP, #0x30]\n"
- "mul v22.4s, v22.4s, v3.4s\n"
- "mul v21.4s, v21.4s, v3.4s\n"
- "str q24, [SP, #0x40]\n"
- "add v28.4s, v28.4s, v29.4s\n"
- "str q23, [SP, #0x50]\n"
- "add v27.4s, v27.4s, v29.4s\n"
- "str q22, [SP, #0x60]\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "str q21, [SP, #0x70]\n"
- "add v24.4s, v24.4s, v29.4s\n"
- "add v23.4s, v23.4s, v29.4s\n"
- "add v22.4s, v22.4s, v29.4s\n"
- "add v21.4s, v21.4s, v29.4s\n"
+ ".inst 0x6f87e390 // udot v16.4s, v28.16b, v7.4b[0]\n"
+ ".inst 0x6fa1ebba // udot v26.4s, v29.16b, v1.4b[3]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ ".inst 0x6fa5e3b9 // udot v25.4s, v29.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ebbb // udot v27.4s, v29.16b, v5.4b[3]\n"
+ "add v30.4s, v21.4s, v24.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ ".inst 0x6fa6e3b3 // udot v19.4s, v29.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ebb2 // udot v18.4s, v29.16b, v6.4b[3]\n"
+ "add v31.4s, v20.4s, v17.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ ".inst 0x6fa7e3b0 // udot v16.4s, v29.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v30.4s\n"
+ "add %x[params], %x[params], #0x50\n"
+ "add v21.4s, v26.4s, v31.4s\n"
+ "add v20.4s, v25.4s, v19.4s\n"
+ "add v19.4s, v27.4s, v18.4s\n"
+ "add v18.4s, v16.4s, v24.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x6f87eb90 // udot v16.4s, v28.16b, v7.4b[2]\n"
+ ".inst 0x6fa7ebb0 // udot v16.4s, v29.16b, v7.4b[3]\n"
+ "add v17.4s, v16.4s, v17.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e3b0 // udot v16.4s, v29.16b, v0.4b[1]\n"
+ "add v24.4s, v22.4s, v16.4s\n"
+ "add v26.4s, v22.4s, v25.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n"
+ ".inst 0x6fa0ebb0 // udot v16.4s, v29.16b, v0.4b[3]\n"
+ "add v25.4s, v21.4s, v16.4s\n"
+ "add v27.4s, v21.4s, v27.4s\n"
+ "add v28.4s, v20.4s, v30.4s\n"
+ "add v29.4s, v19.4s, v31.4s\n"
+ "add v30.4s, v18.4s, v20.4s\n"
+ "add v31.4s, v17.4s, v19.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "mul v24.4s, v24.4s, v23.4s\n"
+ "mul v25.4s, v25.4s, v23.4s\n"
+ "mul v26.4s, v26.4s, v23.4s\n"
+ "mul v27.4s, v27.4s, v23.4s\n"
+ "mul v28.4s, v28.4s, v23.4s\n"
+ "mul v29.4s, v29.4s, v23.4s\n"
+ "mul v30.4s, v30.4s, v23.4s\n"
+ "mul v31.4s, v31.4s, v23.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
"ble 2f\n"
"1:" // Loop
- ".inst 0x6f8de11c // udot v28.4s, v8.16b, v13.4b[0]\n"
- "ldr q20, [%x[params], #0x60]\n"
- "add x11, x11, #0x10\n"
- ".inst 0x6f8de91b // udot v27.4s, v8.16b, v13.4b[2]\n"
- "ldr q19, [%x[params], #0x70]\n"
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ "ldr q12, [%x[params], #0x80]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x6f8ce11a // udot v26.4s, v8.16b, v12.4b[0]\n"
- "ldr q29, [%x[params], #0x80]\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x6f8ce919 // udot v25.4s, v8.16b, v12.4b[2]\n"
- ".inst 0x6f87e118 // udot v24.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e917 // udot v23.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f86e116 // udot v22.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e915 // udot v21.4s, v8.16b, v6.4b[2]\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
"ldr q8, [%x[params], #0x0]\n"
- ".inst 0x6fade13c // udot v28.4s, v9.16b, v13.4b[1]\n"
- ".inst 0x6fade93b // udot v27.4s, v9.16b, v13.4b[3]\n"
- ".inst 0x6face13a // udot v26.4s, v9.16b, v12.4b[1]\n"
- ".inst 0x6face939 // udot v25.4s, v9.16b, v12.4b[3]\n"
- ".inst 0x6fa7e138 // udot v24.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e937 // udot v23.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x6fa6e136 // udot v22.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e935 // udot v21.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q9, [%x[params], #0x10]\n"
- ".inst 0x6f8ce15c // udot v28.4s, v10.16b, v12.4b[0]\n"
- ".inst 0x6f8ce95b // udot v27.4s, v10.16b, v12.4b[2]\n"
- ".inst 0x6f87e15a // udot v26.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x6f87e959 // udot v25.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6f86e158 // udot v24.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e957 // udot v23.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e955 // udot v21.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q10, [%x[params], #0x20]\n"
- ".inst 0x6face17c // udot v28.4s, v11.16b, v12.4b[1]\n"
- ".inst 0x6face97b // udot v27.4s, v11.16b, v12.4b[3]\n"
- ".inst 0x6fa7e17a // udot v26.4s, v11.16b, v7.4b[1]\n"
- ".inst 0x6fa7e979 // udot v25.4s, v11.16b, v7.4b[3]\n"
- ".inst 0x6fa6e178 // udot v24.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e977 // udot v23.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x6fa5e176 // udot v22.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e975 // udot v21.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e11a // udot v26.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91b // udot v27.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q11, [%x[params], #0x30]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e91b // udot v27.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f86e11a // udot v26.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e919 // udot v25.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f85e118 // udot v24.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e917 // udot v23.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f84e116 // udot v22.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e915 // udot v21.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e939 // udot v25.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13a // udot v26.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93b // udot v27.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x6f84e11c // udot v28.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e91d // udot v29.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6f85e11e // udot v30.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e91f // udot v31.4s, v8.16b, v5.4b[2]\n"
"ldr q8, [%x[params], #0x40]\n"
- ".inst 0x6fa7e13c // udot v28.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e93b // udot v27.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x6fa6e13a // udot v26.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e939 // udot v25.4s, v9.16b, v6.4b[3]\n"
- ".inst 0x6fa5e138 // udot v24.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e937 // udot v23.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x6fa4e136 // udot v22.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e935 // udot v21.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6f83e158 // udot v24.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e959 // udot v25.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15a // udot v26.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95b // udot v27.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e13c // udot v28.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e93d // udot v29.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e13e // udot v30.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e93f // udot v31.4s, v9.16b, v5.4b[3]\n"
"ldr q9, [%x[params], #0x50]\n"
- ".inst 0x6f86e15c // udot v28.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e95b // udot v27.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f85e15a // udot v26.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e959 // udot v25.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f84e158 // udot v24.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f84e957 // udot v23.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f82e156 // udot v22.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f82e955 // udot v21.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa3e178 // udot v24.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e979 // udot v25.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17a // udot v26.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97b // udot v27.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x6f85e15c // udot v28.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x6f85e95d // udot v29.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x6f86e15e // udot v30.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x6f86e95f // udot v31.4s, v10.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x6fa6e17c // udot v28.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e97b // udot v27.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x6fa5e17a // udot v26.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e979 // udot v25.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x6fa4e178 // udot v24.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x6fa4e977 // udot v23.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x6fa2e176 // udot v22.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x6fa2e975 // udot v21.4s, v11.16b, v2.4b[3]\n"
- "ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x6f85e11c // udot v28.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f84e11a // udot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e118 // udot v24.4s, v8.16b, v4.4b[0]\n"
".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e917 // udot v23.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%x[params], #0x90]\n"
- ".inst 0x6fa5e13c // udot v28.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x6fa4e13a // udot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6f85e11a // udot v26.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e17c // udot v28.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e97d // udot v29.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e17e // udot v30.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e97f // udot v31.4s, v11.16b, v6.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x6fa4e138 // udot v24.4s, v9.16b, v4.4b[1]\n"
".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6fa2e937 // udot v23.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa1e136 // udot v22.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x6fa1e935 // udot v21.4s, v9.16b, v1.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6fa5e13a // udot v26.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f86e11c // udot v28.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x6f86e91d // udot v29.4s, v8.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f87e11e // udot v30.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x6f87e91f // udot v31.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6fa6e13c // udot v28.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e93d // udot v29.4s, v9.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6fa7e13e // udot v30.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e93f // udot v31.4s, v9.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
- "add %x[params], %x[params], #0xd0\n"
- "sqrdmulh v28.4s, v28.4s, v20.4s\n"
- "sqrdmulh v27.4s, v27.4s, v20.4s\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "sqrdmulh v24.4s, v24.4s, v20.4s\n"
- "and v18.16b, v28.16b, v19.16b\n"
- "and v17.16b, v27.16b, v19.16b\n"
- "and v16.16b, v26.16b, v19.16b\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v16.16b, v27.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v26.4s, v26.4s, v16.4s\n"
- "and v16.16b, v25.16b, v19.16b\n"
- "srshl v28.4s, v28.4s, v19.4s\n"
- "srshl v27.4s, v27.4s, v19.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v27.4s, v27.4s, v0.4s\n"
- "add v26.4s, v26.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v30.4s\n"
- "smin v27.4s, v27.4s, v30.4s\n"
- "smin v26.4s, v26.4s, v30.4s\n"
- "smax v28.4s, v28.4s, v31.4s\n"
- "smax v27.4s, v27.4s, v31.4s\n"
- "smax v26.4s, v26.4s, v31.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x26, x10]\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "ldr q28, [SP, #0x0]\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "str s27, [x25, x10]\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "ldr q27, [SP, #0x10]\n"
- "and v16.16b, v24.16b, v19.16b\n"
- "str s26, [x24, x10]\n"
- "sqrdmulh v23.4s, v23.4s, v20.4s\n"
- "ldr q26, [SP, #0x20]\n"
- "srshl v25.4s, v25.4s, v19.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "and v17.16b, v23.16b, v19.16b\n"
- "add v25.4s, v25.4s, v0.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v30.4s\n"
- "and v16.16b, v22.16b, v19.16b\n"
- "srshl v24.4s, v24.4s, v19.4s\n"
- "smax v25.4s, v25.4s, v31.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "and v16.16b, v31.16b, v20.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v0.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x23, x10]\n"
- "smin v24.4s, v24.4s, v30.4s\n"
- "srshl v23.4s, v23.4s, v19.4s\n"
- "ldr q25, [SP, #0x30]\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v24.4s, v24.4s, v31.4s\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "smin v23.4s, v23.4s, v30.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x22, x10]\n"
- "smax v23.4s, v23.4s, v31.4s\n"
- "add v22.4s, v22.4s, v0.4s\n"
- "ldr q24, [SP, #0x40]\n"
- "and v16.16b, v21.16b, v19.16b\n"
- "add v28.4s, v28.4s, v29.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v22.4s, v22.4s, v30.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x21, x10]\n"
- "smax v22.4s, v22.4s, v31.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr q23, [SP, #0x50]\n"
- "add v27.4s, v27.4s, v29.4s\n"
- "add v26.4s, v26.4s, v29.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x20, x10]\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "add v24.4s, v24.4s, v29.4s\n"
- "ldr q22, [SP, #0x60]\n"
- "srshl v21.4s, v21.4s, v19.4s\n"
- "add v23.4s, v23.4s, v29.4s\n"
- "add v21.4s, v21.4s, v0.4s\n"
- "add v22.4s, v22.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v31.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x19, x10]\n"
- "add x10, x10, #0x4\n"
- "ldr q21, [SP, #0x70]\n"
- "add v21.4s, v21.4s, v29.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
"bgt 1b\n"
"2:" // Tail
- ".inst 0x6f8de11c // udot v28.4s, v8.16b, v13.4b[0]\n"
- "ldr q20, [%x[params], #0x60]\n"
- "add x26, x26, x10\n"
- ".inst 0x6f8de91b // udot v27.4s, v8.16b, v13.4b[2]\n"
- "ldr q19, [%x[params], #0x70]\n"
- "add x25, x25, x10\n"
- ".inst 0x6f8ce11a // udot v26.4s, v8.16b, v12.4b[0]\n"
- "add x24, x24, x10\n"
- ".inst 0x6f8ce919 // udot v25.4s, v8.16b, v12.4b[2]\n"
- "add x23, x23, x10\n"
- ".inst 0x6f87e118 // udot v24.4s, v8.16b, v7.4b[0]\n"
- "add x22, x22, x10\n"
- ".inst 0x6f87e917 // udot v23.4s, v8.16b, v7.4b[2]\n"
- "add x21, x21, x10\n"
- ".inst 0x6f86e116 // udot v22.4s, v8.16b, v6.4b[0]\n"
- "add x20, x20, x10\n"
- ".inst 0x6f86e915 // udot v21.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%x[params], #0x0]\n"
- "add x19, x19, x10\n"
- ".inst 0x6fade13c // udot v28.4s, v9.16b, v13.4b[1]\n"
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
"cmp %x[n_channels], #0x4\n"
- ".inst 0x6fade93b // udot v27.4s, v9.16b, v13.4b[3]\n"
- ".inst 0x6face13a // udot v26.4s, v9.16b, v12.4b[1]\n"
- ".inst 0x6face939 // udot v25.4s, v9.16b, v12.4b[3]\n"
- ".inst 0x6fa7e138 // udot v24.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e937 // udot v23.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x6fa6e136 // udot v22.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e935 // udot v21.4s, v9.16b, v6.4b[3]\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "add x20, x20, x28\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q9, [%x[params], #0x10]\n"
- ".inst 0x6f8ce15c // udot v28.4s, v10.16b, v12.4b[0]\n"
- ".inst 0x6f8ce95b // udot v27.4s, v10.16b, v12.4b[2]\n"
- ".inst 0x6f87e15a // udot v26.4s, v10.16b, v7.4b[0]\n"
- ".inst 0x6f87e959 // udot v25.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6f86e158 // udot v24.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e957 // udot v23.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e955 // udot v21.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q10, [%x[params], #0x20]\n"
- ".inst 0x6face17c // udot v28.4s, v11.16b, v12.4b[1]\n"
- ".inst 0x6face97b // udot v27.4s, v11.16b, v12.4b[3]\n"
- ".inst 0x6fa7e17a // udot v26.4s, v11.16b, v7.4b[1]\n"
- ".inst 0x6fa7e979 // udot v25.4s, v11.16b, v7.4b[3]\n"
- ".inst 0x6fa6e178 // udot v24.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e977 // udot v23.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x6fa5e176 // udot v22.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e975 // udot v21.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e11a // udot v26.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91b // udot v27.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q11, [%x[params], #0x30]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- ".inst 0x6f87e91b // udot v27.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f86e11a // udot v26.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f86e919 // udot v25.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f85e118 // udot v24.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e917 // udot v23.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f84e116 // udot v22.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f84e915 // udot v21.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e939 // udot v25.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13a // udot v26.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93b // udot v27.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x6f84e11c // udot v28.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e91d // udot v29.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6f85e11e // udot v30.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e91f // udot v31.4s, v8.16b, v5.4b[2]\n"
"ldr q8, [%x[params], #0x40]\n"
- ".inst 0x6fa7e13c // udot v28.4s, v9.16b, v7.4b[1]\n"
- ".inst 0x6fa7e93b // udot v27.4s, v9.16b, v7.4b[3]\n"
- ".inst 0x6fa6e13a // udot v26.4s, v9.16b, v6.4b[1]\n"
- ".inst 0x6fa6e939 // udot v25.4s, v9.16b, v6.4b[3]\n"
- ".inst 0x6fa5e138 // udot v24.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e937 // udot v23.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x6fa4e136 // udot v22.4s, v9.16b, v4.4b[1]\n"
- ".inst 0x6fa4e935 // udot v21.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6f83e158 // udot v24.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e959 // udot v25.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15a // udot v26.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95b // udot v27.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e13c // udot v28.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e93d // udot v29.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e13e // udot v30.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e93f // udot v31.4s, v9.16b, v5.4b[3]\n"
"ldr q9, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x6f86e15c // udot v28.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f86e95b // udot v27.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f85e15a // udot v26.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f85e959 // udot v25.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f84e158 // udot v24.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f84e957 // udot v23.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f82e156 // udot v22.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f82e955 // udot v21.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6fa6e17c // udot v28.4s, v11.16b, v6.4b[1]\n"
- ".inst 0x6fa6e97b // udot v27.4s, v11.16b, v6.4b[3]\n"
- ".inst 0x6fa5e17a // udot v26.4s, v11.16b, v5.4b[1]\n"
- ".inst 0x6fa5e979 // udot v25.4s, v11.16b, v5.4b[3]\n"
- ".inst 0x6fa4e178 // udot v24.4s, v11.16b, v4.4b[1]\n"
- ".inst 0x6fa4e977 // udot v23.4s, v11.16b, v4.4b[3]\n"
- ".inst 0x6fa2e176 // udot v22.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x6fa2e975 // udot v21.4s, v11.16b, v2.4b[3]\n"
- ".inst 0x6f85e11c // udot v28.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f84e11a // udot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6fa3e178 // udot v24.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e979 // udot v25.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17a // udot v26.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97b // udot v27.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x6f85e15c // udot v28.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x6f85e95d // udot v29.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x6f86e15e // udot v30.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x6f86e95f // udot v31.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x6f84e118 // udot v24.4s, v8.16b, v4.4b[0]\n"
".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e917 // udot v23.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6fa5e13c // udot v28.4s, v9.16b, v5.4b[1]\n"
- ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
- ".inst 0x6fa4e13a // udot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6f85e11a // udot v26.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e17c // udot v28.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e97d // udot v29.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e17e // udot v30.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e97f // udot v31.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6fa4e138 // udot v24.4s, v9.16b, v4.4b[1]\n"
".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
- ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
- ".inst 0x6fa2e937 // udot v23.4s, v9.16b, v2.4b[3]\n"
- ".inst 0x6fa1e136 // udot v22.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x6fa1e935 // udot v21.4s, v9.16b, v1.4b[3]\n"
- "sqrdmulh v28.4s, v28.4s, v20.4s\n"
- "sqrdmulh v27.4s, v27.4s, v20.4s\n"
- "sqrdmulh v26.4s, v26.4s, v20.4s\n"
- "sqrdmulh v25.4s, v25.4s, v20.4s\n"
- "and v18.16b, v28.16b, v19.16b\n"
- "and v17.16b, v27.16b, v19.16b\n"
- "and v16.16b, v26.16b, v19.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6fa5e13a // udot v26.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f86e11c // udot v28.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x6f86e91d // udot v29.4s, v8.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f87e11e // udot v30.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x6f87e91f // udot v31.4s, v8.16b, v7.4b[2]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6fa6e13c // udot v28.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e93d // udot v29.4s, v9.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6fa7e13e // udot v30.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e93f // udot v31.4s, v9.16b, v7.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v28.4s, v28.4s, v18.4s\n"
- "sqadd v27.4s, v27.4s, v17.4s\n"
- "sqadd v26.4s, v26.4s, v16.4s\n"
- "and v16.16b, v25.16b, v19.16b\n"
- "srshl v28.4s, v28.4s, v19.4s\n"
- "srshl v27.4s, v27.4s, v19.4s\n"
- "srshl v26.4s, v26.4s, v19.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v28.4s, v28.4s, v0.4s\n"
- "add v27.4s, v27.4s, v0.4s\n"
- "add v26.4s, v26.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v30.4s\n"
- "smin v27.4s, v27.4s, v30.4s\n"
- "smin v26.4s, v26.4s, v30.4s\n"
- "smax v28.4s, v28.4s, v31.4s\n"
- "smax v27.4s, v27.4s, v31.4s\n"
- "smax v26.4s, v26.4s, v31.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sqrdmulh v24.4s, v24.4s, v20.4s\n"
- "sqrdmulh v23.4s, v23.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v19.4s\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "and v16.16b, v24.16b, v19.16b\n"
- "and v17.16b, v23.16b, v19.16b\n"
- "add v25.4s, v25.4s, v0.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "smin v25.4s, v25.4s, v30.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "smax v25.4s, v25.4s, v31.4s\n"
- "and v16.16b, v22.16b, v19.16b\n"
- "srshl v24.4s, v24.4s, v19.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "srshl v23.4s, v23.4s, v19.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v0.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v23.4s, v23.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v30.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v30.4s\n"
- "smax v24.4s, v24.4s, v31.4s\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "smax v23.4s, v23.4s, v31.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "srshl v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "and v16.16b, v21.16b, v19.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "add v22.4s, v22.4s, v0.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v22.4s, v22.4s, v30.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v31.4s\n"
- "srshl v21.4s, v21.4s, v19.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "add v21.4s, v21.4s, v0.4s\n"
- "smin v21.4s, v21.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v31.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"blt 3f\n"
- "str s28, [x26, #0x0]\n"
- "str s27, [x25, #0x0]\n"
- "str s26, [x24, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s24, [x22, #0x0]\n"
- "str s23, [x21, #0x0]\n"
- "str s22, [x20, #0x0]\n"
- "str s21, [x19, #0x0]\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
"b 4f\n"
"3:" // Tail: Oddments
- "st1 { v28.b }[0], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[0], [x25], #0x1\n"
- "st1 { v26.b }[0], [x24], #0x1\n"
- "st1 { v25.b }[0], [x23], #0x1\n"
- "st1 { v24.b }[0], [x22], #0x1\n"
- "st1 { v23.b }[0], [x21], #0x1\n"
- "st1 { v22.b }[0], [x20], #0x1\n"
- "st1 { v21.b }[0], [x19], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v28.b }[1], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[1], [x25], #0x1\n"
- "st1 { v26.b }[1], [x24], #0x1\n"
- "st1 { v25.b }[1], [x23], #0x1\n"
- "st1 { v24.b }[1], [x22], #0x1\n"
- "st1 { v23.b }[1], [x21], #0x1\n"
- "st1 { v22.b }[1], [x20], #0x1\n"
- "st1 { v21.b }[1], [x19], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v28.b }[2], [x26], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[2], [x25], #0x1\n"
- "st1 { v26.b }[2], [x24], #0x1\n"
- "st1 { v25.b }[2], [x23], #0x1\n"
- "st1 { v24.b }[2], [x22], #0x1\n"
- "st1 { v23.b }[2], [x21], #0x1\n"
- "st1 { v22.b }[2], [x20], #0x1\n"
- "st1 { v21.b }[2], [x19], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"beq 4f\n"
- "st1 { v28.b }[3], [x26], #0x1\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
"subs %x[n_channels], %x[n_channels], #0x1\n"
- "st1 { v27.b }[3], [x25], #0x1\n"
- "st1 { v26.b }[3], [x24], #0x1\n"
- "st1 { v25.b }[3], [x23], #0x1\n"
- "st1 { v24.b }[3], [x22], #0x1\n"
- "st1 { v23.b }[3], [x21], #0x1\n"
- "st1 { v22.b }[3], [x20], #0x1\n"
- "st1 { v21.b }[3], [x19], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
"4:" // Tail: End
- "add SP, SP, #0x80\n"
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index e896304c59..bbb817a883 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,1439 +45,1433 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
)
{
__asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v3.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
"mov x9, #0x0\n"
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v14.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v13.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v11.16b }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v9.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
- "ld1r { v8.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v7.4s }, [x19]\n"
- "lsr x28, %x[n_output_channels], #0x2\n"
- "cbz x28, 9f\n"
+ "cbz x10, 9f\n"
"1:" // Output channel loop
- "movi v16.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x9, #0x2\n"
- "ldr q16, [%x[bias], x19]\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "mov v6.16b, v16.16b\n"
- "mov v5.16b, v16.16b\n"
- "mov v4.16b, v16.16b\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v16.16b\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
"cbz %x[rq_mul_ptr], 3f\n"
- "lsl x19, x9, #0x2\n"
- "ldr q8, [%x[rq_mul_ptr], x19]\n"
- "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+ "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s17, [%x[weights]], #0x4\n"
- "usubl v17.8h, v17.8b, v11.8b\n"
- "mov x19, %x[inptrs]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "ldr d3, [x25, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "cbz x20, 7f\n"
- "ldp x25, x27, [x19], #0x10\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "usubl v16.8h, v16.8b, v11.8b\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "cbz x21, 7f\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
"ldr d1, [x25, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d0, [x28, #0x0]\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
"ldr d1, [x25, #0x0]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v16.8h, v16.8b, v11.8b\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
- "b 8f\n"
- "6:" // Output channel loop: Odd tail
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldr d7, [x28, #0x0]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
- "b 8f\n"
- "7:" // Output channel loop: Single kernel point
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
- "cmp x9, x28, LSL #2\n"
+ "cmp x9, x10, LSL #2\n"
"blt 1b\n"
"tst %x[n_output_channels], #0x3\n"
"beq 26f\n"
"9:" // Output channel oddments
- "movi v16.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"cbz %x[bias], 12f\n"
- "add x19, %x[bias], x9, LSL #2\n"
+ "add x20, %x[bias], x9, LSL #2\n"
"tbz %x[n_output_channels], #1, 10f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 11f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 11f\n"
"10:" // Output channel oddments: Load bias: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 11f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "ld1 { v31.s }[0], [x20]\n"
"11:" // Output channel oddments: Load bias: Bit 1: End
-
"12:" // Output channel oddments: Load bias: Done
- "mov v6.16b, v16.16b\n"
- "mov v5.16b, v16.16b\n"
- "mov v4.16b, v16.16b\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v16.16b\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
"cbz %x[rq_mul_ptr], 18f\n"
- "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
- "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
- "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
- "ld1 { v8.d }[0], [x21], #0x8\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
- "ld1 { v8.s }[2], [x21], #0x4\n"
- "ld1 { v7.s }[2], [x20], #0x4\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 14f\n"
- "ld1 { v8.s }[0], [x21], #0x4\n"
- "ld1 { v7.s }[0], [x20], #0x4\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
- "ld1 { v8.d }[0], [x21], #0x8\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
- "ld1 { v8.s }[2], [x21], #0x4\n"
- "ld1 { v7.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 17f\n"
- "ld1 { v8.s }[0], [x21], #0x4\n"
- "ld1 { v7.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
-
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s17, [%x[weights]], #0x4\n"
- "usubl v17.8h, v17.8b, v11.8b\n"
- "mov x19, %x[inptrs]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "ldr d3, [x25, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "cbz x20, 22f\n"
- "ldp x25, x27, [x19], #0x10\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "usubl v16.8h, v16.8b, v11.8b\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "cbz x21, 22f\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
"ldr d1, [x25, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d0, [x28, #0x0]\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
"ldr d1, [x25, #0x0]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "usubl v16.8h, v16.8b, v11.8b\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "usubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.h }[0], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.h }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.h }[0], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.h }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.h }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.h }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.h }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.h }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.h }[0], [x25]\n"
+ "st1 { v23.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v19.h }[0], [x26]\n"
+ "st1 { v24.h }[0], [x20]\n"
+ "st1 { v25.h }[0], [x21]\n"
+ "st1 { v26.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x23]\n"
+ "st1 { v28.h }[0], [x24]\n"
+ "st1 { v29.h }[0], [x25]\n"
+ "st1 { v30.h }[0], [x26]\n"
+ "st1 { v31.h }[0], [x27]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.b }[2], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.b }[2], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.b }[2], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.b }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.b }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.b }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.b }[2], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.b }[2], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.b }[2], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.b }[2], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.b }[2], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.b }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.b }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.b }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v19.b }[2], [x26]\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v25.b }[2], [x21]\n"
+ "st1 { v26.b }[2], [x22]\n"
+ "st1 { v27.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x24]\n"
+ "st1 { v29.b }[2], [x25]\n"
+ "st1 { v30.b }[2], [x26]\n"
+ "st1 { v31.b }[2], [x27]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 25f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.b }[0], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.b }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.b }[0], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.b }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.b }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.b }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.b }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.b }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.b }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.b }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.b }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.b }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.b }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.b }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.b }[0], [x25]\n"
- "st1 { v19.b }[0], [x26]\n"
+ "st1 { v23.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
+ "st1 { v24.b }[0], [x20]\n"
+ "st1 { v25.b }[0], [x21]\n"
+ "st1 { v26.b }[0], [x22]\n"
+ "st1 { v27.b }[0], [x23]\n"
+ "st1 { v28.b }[0], [x24]\n"
+ "st1 { v29.b }[0], [x25]\n"
+ "st1 { v30.b }[0], [x26]\n"
+ "st1 { v31.b }[0], [x27]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
"26:" // Done
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 89cb2ec380..afc6695ff1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,65 +91,65 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x19, %[offsetof_Requantize32_c_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "ld1r { v15.16b }, [x23]\n"
- "ld1r { v13.8h }, [x22]\n"
- "lsr x16, x8, #0x3\n"
- "mov x15, #0x0\n"
- "ld1r { v11.8h }, [x20]\n"
- "ld1r { v25.8h }, [x19]\n"
- "mov x14, #0x0\n"
- "add x13, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q12, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v14.16b, v12.16b\n"
- "ldr q17, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v9.16b, v17.16b\n"
- "mov v16.16b, v12.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v10.16b, v17.16b\n"
- "mov v18.16b, v12.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v26.16b, v17.16b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldp x23, x22, [x13, #0x0]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x19, [x13, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v20.8h }, [x21]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x17, #0x0\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "usubl v4.8h, v4.8b, v12.8b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "ldr q14, [x27, #0x0]\n"
+ "ldr q11, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "str x27, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr d31, [x24, x17]\n"
+ "ldr d30, [x23, x17]\n"
+ "mov v24.16b, v14.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr d29, [x22, x17]\n"
+ "ldr d28, [x21, x17]\n"
+ "mov v23.16b, v14.16b\n"
+ "mov v25.16b, v11.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d27, [x20, x17]\n"
"ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ushll v29.8h, v29.8b, #0x0\n"
@@ -157,226 +157,226 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"ushll v27.8h, v27.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "smlal v12.4s, v31.4h, v4.4h\n"
- "smlal2 v17.4s, v31.8h, v4.8h\n"
- "ldr x21, [x13, #0x28]\n"
- "ldr x26, [x13, #0x38]\n"
- "smlal v14.4s, v31.4h, v3.4h\n"
+ "ldr q22, [x13, #0x0]\n"
+ "ldr q10, [x12, #0x0]\n"
+ "smlal v14.4s, v31.4h, v4.4h\n"
+ "smlal2 v11.4s, v31.8h, v4.8h\n"
+ "ldr q18, [x13, #0x10]\n"
+ "ldr q26, [x12, #0x10]\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
"smlal2 v9.4s, v31.8h, v3.8h\n"
- "ldr x20, [x13, #0x30]\n"
- "ldr x25, [x13, #0x40]\n"
- "smlal v12.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr x19, [x13, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v14.4s, v29.4h, v2.4h\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal2 v11.4s, v30.8h, v0.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "smlal v16.4s, v29.4h, v2.4h\n"
"smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "ldr x24, [x13, #0x50]\n"
- "ldr x23, [x13, #0x58]\n"
- "smlal v18.4s, v31.4h, v0.4h\n"
- "smlal2 v26.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal v24.4s, v31.4h, v1.4h\n"
+ "smlal2 v17.4s, v31.8h, v1.8h\n"
+ "ldr x26, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr d30, [x20, x17]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v25.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v12.4s, v28.4h, v5.4h\n"
- "smlal2 v17.4s, v28.8h, v5.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x22, [x13, #0x60]\n"
- "smlal v14.4s, v28.4h, v4.4h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal v16.4s, v28.4h, v4.4h\n"
"smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr x20, [x13, #0x70]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ldr x19, [x13, #0x78]\n"
- "ldr q21, [x12, #0x0]\n"
- "smlal v18.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x24, [x15, #0x58]\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v17.4s, v28.8h, v2.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v25.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x21, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal v12.4s, v27.4h, v7.4h\n"
- "smlal2 v17.4s, v27.8h, v7.8h\n"
- "ldr q24, [x11, #0x0]\n"
- "ldr q19, [x12, #0x10]\n"
- "smlal v14.4s, v27.4h, v6.4h\n"
+ "smlal v14.4s, v27.4h, v7.4h\n"
+ "smlal2 v11.4s, v27.8h, v7.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v16.4s, v27.4h, v6.4h\n"
"smlal2 v9.4s, v27.8h, v6.8h\n"
- "ldr q23, [x11, #0x10]\n"
- "add x17, x17, #0x48\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v10.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v24.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x26, x17]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v18.4s, v27.4h, v3.4h\n"
- "smlal2 v26.4s, v27.8h, v3.8h\n"
- "subs x16, x16, #0x1\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v25.4s, v27.8h, v3.8h\n"
+ "add x14, x14, #0x48\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v14.4s, v28.4h, v1.4h\n"
+ "smlal2 v11.4s, v28.8h, v1.8h\n"
+ "add x13, x13, #0x20\n"
"add x12, x12, #0x20\n"
- "smlal v12.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "add x11, x11, #0x20\n"
- "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal v16.4s, v28.4h, v0.4h\n"
"smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
+ "ldr d28, [x24, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v18.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal2 v26.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal v24.4s, v27.4h, v4.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v17.4s, v27.8h, v4.8h\n"
+ "smlal2 v25.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v12.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal v14.4s, v31.4h, v2.4h\n"
+ "smlal2 v11.4s, v31.8h, v2.8h\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
"smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
+ "ldr d31, [x23, x17]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v18.4s, v30.4h, v4.4h\n"
- "smlal v12.4s, v30.4h, v8.4h\n"
- "smlal2 v17.4s, v30.8h, v8.8h\n"
- "smlal v14.4s, v30.4h, v7.4h\n"
+ "smlal v24.4s, v30.4h, v5.4h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal v14.4s, v30.4h, v8.4h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "smlal v16.4s, v30.4h, v7.4h\n"
"smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal2 v26.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
+ "smlal2 v17.4s, v30.8h, v5.8h\n"
+ "smlal2 v25.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x17]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal v12.4s, v29.4h, v3.4h\n"
- "smlal2 v17.4s, v29.8h, v3.8h\n"
- "smlal2 v10.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "smlal v24.4s, v29.4h, v0.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal v14.4s, v29.4h, v3.4h\n"
+ "smlal2 v11.4s, v29.8h, v3.8h\n"
+ "smlal2 v17.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x17]\n"
+ "smlal2 v25.4s, v28.8h, v2.8h\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal v24.4s, v31.4h, v3.4h\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal v16.4s, v28.4h, v5.4h\n"
"smlal2 v9.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
+ "ldr d28, [x20, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v10.4s, v31.8h, v3.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal v18.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v7.8h\n"
- "smlal2 v26.4s, v29.8h, v6.8h\n"
- "smlal v12.4s, v31.4h, v6.4h\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "sqrdmulh v12.4s, v12.4s, v21.4s\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal v18.4s, v28.4h, v7.4h\n"
- "sqrdmulh v14.4s, v14.4s, v21.4s\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "smlal2 v10.4s, v28.8h, v8.8h\n"
- "smlal2 v26.4s, v28.8h, v7.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "and v29.16b, v12.16b, v24.16b\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
- "and v22.16b, v14.16b, v24.16b\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "and v21.16b, v16.16b, v24.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "and v20.16b, v18.16b, v24.16b\n"
- "sqrdmulh v26.4s, v26.4s, v19.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v19.16b, v17.16b, v23.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v30.16b, v9.16b, v23.16b\n"
+ "smlal v14.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v14.4s, v14.4s, v22.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v25.4s, v30.8h, v5.8h\n"
+ "smlal v24.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v14.16b, v10.16b\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v11.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "smlal2 v17.4s, v29.8h, v7.8h\n"
+ "smlal2 v25.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v10.16b, v23.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v28.16b, v26.16b, v23.16b\n"
- "sqadd v12.4s, v12.4s, v29.4s\n"
+ "smlal v16.4s, v30.4h, v8.4h\n"
+ "smlal v24.4s, v28.4h, v8.4h\n"
+ "and v4.16b, v11.16b, v26.16b\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v22.4s\n"
+ "smlal2 v17.4s, v28.8h, v8.8h\n"
+ "smlal2 v25.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v16.16b, v10.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "and v3.16b, v24.16b, v10.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "and v21.16b, v23.16b, v10.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v18.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v22.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v21.4s\n"
+ "and v27.16b, v9.16b, v26.16b\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v20.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v12.4s, v12.4s, v24.4s\n"
- "sqadd v17.4s, v17.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v24.4s\n"
- "sqadd v9.4s, v9.4s, v30.4s\n"
- "srshl v16.4s, v16.4s, v24.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v24.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "srshl v17.4s, v17.4s, v23.4s\n"
- "sqxtn v12.4h, v12.4s\n"
- "srshl v9.4s, v9.4s, v23.4s\n"
+ "and v5.16b, v17.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v4.16b, v25.16b, v26.16b\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v3.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v14.4s, v14.4s, v10.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v25.4s, v25.4s, v4.4s\n"
+ "srshl v11.4s, v11.4s, v26.4s\n"
"sqxtn v14.4h, v14.4s\n"
- "srshl v10.4s, v10.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v26.4s\n"
"sqxtn v16.4h, v16.4s\n"
- "srshl v26.4s, v26.4s, v23.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "sqxtn2 v12.8h, v17.4s\n"
- "sqxtn2 v14.8h, v9.4s\n"
- "sqxtn2 v16.8h, v10.4s\n"
- "sqxtn2 v18.8h, v26.4s\n"
- "sqadd v12.8h, v12.8h, v13.8h\n"
- "sqadd v14.8h, v14.8h, v13.8h\n"
- "sqadd v16.8h, v16.8h, v13.8h\n"
- "sqadd v18.8h, v18.8h, v13.8h\n"
- "smax v12.8h, v12.8h, v11.8h\n"
- "smax v14.8h, v14.8h, v11.8h\n"
- "smax v16.8h, v16.8h, v11.8h\n"
- "smax v18.8h, v18.8h, v11.8h\n"
- "smin v12.8h, v12.8h, v25.8h\n"
- "smin v14.8h, v14.8h, v25.8h\n"
- "smin v16.8h, v16.8h, v25.8h\n"
- "smin v18.8h, v18.8h, v25.8h\n"
- "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "srshl v17.4s, v17.4s, v26.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v25.4s, v25.4s, v26.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v14.8h, v11.4s\n"
+ "sqxtn2 v16.8h, v9.4s\n"
+ "sqxtn2 v24.8h, v17.4s\n"
+ "sqxtn2 v23.8h, v25.4s\n"
+ "sqadd v14.8h, v14.8h, v20.8h\n"
+ "sqadd v16.8h, v16.8h, v20.8h\n"
+ "sqadd v24.8h, v24.8h, v20.8h\n"
+ "sqadd v23.8h, v23.8h, v20.8h\n"
+ "smax v14.8h, v14.8h, v15.8h\n"
+ "smax v16.8h, v16.8h, v15.8h\n"
+ "smax v24.8h, v24.8h, v15.8h\n"
+ "smax v23.8h, v23.8h, v15.8h\n"
+ "smin v14.8h, v14.8h, v13.8h\n"
+ "smin v16.8h, v16.8h, v13.8h\n"
+ "smin v24.8h, v24.8h, v13.8h\n"
+ "smin v23.8h, v23.8h, v13.8h\n"
"uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d12, [x10, x14]\n"
+ "str d14, [x11, x16]\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d14, [x9, x14]\n"
- "str d16, [x28, x14]\n"
- "str d18, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q12, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q17, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v14.16b, v12.16b\n"
- "mov v9.16b, v17.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v16.16b, v12.16b\n"
- "mov v10.16b, v17.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v18.16b, v12.16b\n"
- "mov v26.16b, v17.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x23, x22, [x13, #0x0]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x19, [x13, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d16, [x10, x16]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x9, x16]\n"
+ "str d23, [x28, x16]\n"
+ "ldr q14, [x27, #0x0]\n"
+ "ldr q11, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x27, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "mov v24.16b, v14.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v23.16b, v14.16b\n"
+ "mov v25.16b, v11.16b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr d31, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d30, [x23, x17]\n"
+ "ldr d29, [x22, x17]\n"
+ "usubl v4.8h, v4.8b, v12.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
"ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ushll v29.8h, v29.8b, #0x0\n"
@@ -384,777 +384,777 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"ushll v27.8h, v27.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v12.4s, v31.4h, v4.4h\n"
- "smlal2 v17.4s, v31.8h, v4.8h\n"
- "ldr x21, [x13, #0x28]\n"
- "ldr x26, [x13, #0x38]\n"
- "smlal v14.4s, v31.4h, v3.4h\n"
+ "ldr q22, [x13, #0x0]\n"
+ "ldr q10, [x12, #0x0]\n"
+ "smlal v14.4s, v31.4h, v4.4h\n"
+ "smlal2 v11.4s, v31.8h, v4.8h\n"
+ "ldr q18, [x13, #0x10]\n"
+ "ldr q26, [x12, #0x10]\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
"smlal2 v9.4s, v31.8h, v3.8h\n"
- "ldr x20, [x13, #0x30]\n"
- "ldr x25, [x13, #0x40]\n"
- "smlal v12.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
- "ldr x19, [x13, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v14.4s, v29.4h, v2.4h\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal2 v11.4s, v30.8h, v0.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "smlal v16.4s, v29.4h, v2.4h\n"
"smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "ldr x24, [x13, #0x50]\n"
- "ldr x23, [x13, #0x58]\n"
- "smlal v18.4s, v31.4h, v0.4h\n"
- "smlal2 v26.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal v24.4s, v31.4h, v1.4h\n"
+ "smlal2 v17.4s, v31.8h, v1.8h\n"
+ "ldr x26, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr d30, [x20, x17]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v25.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v12.4s, v28.4h, v5.4h\n"
- "smlal2 v17.4s, v28.8h, v5.8h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x22, [x13, #0x60]\n"
- "smlal v14.4s, v28.4h, v4.4h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal v16.4s, v28.4h, v4.4h\n"
"smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr x21, [x13, #0x68]\n"
- "ldr x20, [x13, #0x70]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ldr x19, [x13, #0x78]\n"
- "ldr q21, [x12, #0x0]\n"
- "smlal v18.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x24, [x15, #0x58]\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v17.4s, v28.8h, v2.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v25.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x21, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal v12.4s, v27.4h, v7.4h\n"
- "smlal2 v17.4s, v27.8h, v7.8h\n"
- "ldr q24, [x11, #0x0]\n"
- "ldr q19, [x12, #0x10]\n"
- "smlal v14.4s, v27.4h, v6.4h\n"
+ "smlal v14.4s, v27.4h, v7.4h\n"
+ "smlal2 v11.4s, v27.8h, v7.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v16.4s, v27.4h, v6.4h\n"
"smlal2 v9.4s, v27.8h, v6.8h\n"
- "ldr q23, [x11, #0x10]\n"
- "tst x8, #0x7\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v10.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal v24.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x26, x17]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v18.4s, v27.4h, v3.4h\n"
- "smlal2 v26.4s, v27.8h, v3.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v25.4s, v27.8h, v3.8h\n"
+ "add x13, x13, #0x20\n"
"add x12, x12, #0x20\n"
- "add x11, x11, #0x20\n"
- "smlal v12.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal v14.4s, v28.4h, v1.4h\n"
+ "smlal2 v11.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v28.4h, v0.4h\n"
"smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
+ "ldr d28, [x24, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal v18.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal2 v26.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal v24.4s, v27.4h, v4.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v17.4s, v27.8h, v4.8h\n"
+ "smlal2 v25.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v12.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal v14.4s, v31.4h, v2.4h\n"
+ "smlal2 v11.4s, v31.8h, v2.8h\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
"smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
+ "ldr d31, [x23, x17]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal v18.4s, v30.4h, v4.4h\n"
- "smlal v12.4s, v30.4h, v8.4h\n"
- "smlal2 v17.4s, v30.8h, v8.8h\n"
- "smlal v14.4s, v30.4h, v7.4h\n"
+ "smlal v24.4s, v30.4h, v5.4h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal v14.4s, v30.4h, v8.4h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "smlal v16.4s, v30.4h, v7.4h\n"
"smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal2 v26.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
+ "smlal2 v17.4s, v30.8h, v5.8h\n"
+ "smlal2 v25.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x17]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal v12.4s, v29.4h, v3.4h\n"
- "smlal2 v17.4s, v29.8h, v3.8h\n"
- "smlal2 v10.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "smlal v24.4s, v29.4h, v0.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal v14.4s, v29.4h, v3.4h\n"
+ "smlal2 v11.4s, v29.8h, v3.8h\n"
+ "smlal2 v17.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x17]\n"
+ "smlal2 v25.4s, v28.8h, v2.8h\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal v24.4s, v31.4h, v3.4h\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal v16.4s, v28.4h, v5.4h\n"
"smlal2 v9.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
+ "ldr d28, [x20, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v10.4s, v31.8h, v3.8h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal v18.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v7.8h\n"
- "smlal2 v26.4s, v29.8h, v6.8h\n"
- "smlal v12.4s, v31.4h, v6.4h\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
- "sqrdmulh v12.4s, v12.4s, v21.4s\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal v18.4s, v28.4h, v7.4h\n"
- "sqrdmulh v14.4s, v14.4s, v21.4s\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "smlal2 v10.4s, v28.8h, v8.8h\n"
- "smlal2 v26.4s, v28.8h, v7.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "and v29.16b, v12.16b, v24.16b\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
- "and v22.16b, v14.16b, v24.16b\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "and v21.16b, v16.16b, v24.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "and v20.16b, v18.16b, v24.16b\n"
- "sqrdmulh v26.4s, v26.4s, v19.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v19.16b, v17.16b, v23.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v30.16b, v9.16b, v23.16b\n"
+ "smlal v14.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v14.4s, v14.4s, v22.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v25.4s, v30.8h, v5.8h\n"
+ "smlal v24.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v14.16b, v10.16b\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v11.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "smlal2 v17.4s, v29.8h, v7.8h\n"
+ "smlal2 v25.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v10.16b, v23.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v28.16b, v26.16b, v23.16b\n"
- "sqadd v12.4s, v12.4s, v29.4s\n"
+ "smlal v16.4s, v30.4h, v8.4h\n"
+ "smlal v24.4s, v28.4h, v8.4h\n"
+ "and v4.16b, v11.16b, v26.16b\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v22.4s\n"
+ "smlal2 v17.4s, v28.8h, v8.8h\n"
+ "smlal2 v25.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v16.16b, v10.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "and v3.16b, v24.16b, v10.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "and v21.16b, v23.16b, v10.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v18.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v22.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v21.4s\n"
+ "and v27.16b, v9.16b, v26.16b\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v20.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v12.4s, v12.4s, v24.4s\n"
- "sqadd v17.4s, v17.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v24.4s\n"
- "sqadd v9.4s, v9.4s, v30.4s\n"
- "srshl v16.4s, v16.4s, v24.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v24.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "srshl v17.4s, v17.4s, v23.4s\n"
- "sqxtn v12.4h, v12.4s\n"
- "srshl v9.4s, v9.4s, v23.4s\n"
+ "and v5.16b, v17.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v4.16b, v25.16b, v26.16b\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v3.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v14.4s, v14.4s, v10.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v25.4s, v25.4s, v4.4s\n"
+ "srshl v11.4s, v11.4s, v26.4s\n"
"sqxtn v14.4h, v14.4s\n"
- "srshl v10.4s, v10.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v26.4s\n"
"sqxtn v16.4h, v16.4s\n"
- "srshl v26.4s, v26.4s, v23.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "sqxtn2 v12.8h, v17.4s\n"
- "sqxtn2 v14.8h, v9.4s\n"
- "sqxtn2 v16.8h, v10.4s\n"
- "sqxtn2 v18.8h, v26.4s\n"
- "sqadd v12.8h, v12.8h, v13.8h\n"
- "sqadd v14.8h, v14.8h, v13.8h\n"
- "sqadd v16.8h, v16.8h, v13.8h\n"
- "sqadd v18.8h, v18.8h, v13.8h\n"
- "smax v12.8h, v12.8h, v11.8h\n"
- "smax v14.8h, v14.8h, v11.8h\n"
- "smax v16.8h, v16.8h, v11.8h\n"
- "smax v18.8h, v18.8h, v11.8h\n"
- "smin v12.8h, v12.8h, v25.8h\n"
- "smin v14.8h, v14.8h, v25.8h\n"
- "smin v16.8h, v16.8h, v25.8h\n"
- "smin v18.8h, v18.8h, v25.8h\n"
- "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "srshl v17.4s, v17.4s, v26.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v25.4s, v25.4s, v26.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v14.8h, v11.4s\n"
+ "sqxtn2 v16.8h, v9.4s\n"
+ "sqxtn2 v24.8h, v17.4s\n"
+ "sqxtn2 v23.8h, v25.4s\n"
+ "sqadd v14.8h, v14.8h, v20.8h\n"
+ "sqadd v16.8h, v16.8h, v20.8h\n"
+ "sqadd v24.8h, v24.8h, v20.8h\n"
+ "sqadd v23.8h, v23.8h, v20.8h\n"
+ "smax v14.8h, v14.8h, v15.8h\n"
+ "smax v16.8h, v16.8h, v15.8h\n"
+ "smax v24.8h, v24.8h, v15.8h\n"
+ "smax v23.8h, v23.8h, v15.8h\n"
+ "smin v14.8h, v14.8h, v13.8h\n"
+ "smin v16.8h, v16.8h, v13.8h\n"
+ "smin v24.8h, v24.8h, v13.8h\n"
+ "smin v23.8h, v23.8h, v13.8h\n"
"uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d12, [x10, x14]\n"
+ "str d14, [x11, x16]\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d14, [x9, x14]\n"
- "str d16, [x28, x14]\n"
- "str d18, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d16, [x10, x16]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x9, x16]\n"
+ "str d23, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 64f\n"
- "add x17, x17, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v12.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v17.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v17.s }[2], [x19]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v14.4s }, [x27], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v11.d }[0], [x27], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v11.s }[2], [x27]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v17.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v11.s }[0], [x27]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v12.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v12.s }[2], [x19]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v14.d }[0], [x27], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v14.s }[2], [x27]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v12.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v14.s }[0], [x27]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "mov v14.16b, v12.16b\n"
- "mov v9.16b, v17.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v16.16b, v12.16b\n"
- "mov v10.16b, v17.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
- "mov v18.16b, v12.16b\n"
- "mov v26.16b, v17.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x23, x22, [x13, #0x0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "ldr x19, [x13, #0x20]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v5.8h, v5.8b, v15.8b\n"
- "usubl v6.8h, v6.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v8.8h, v8.8b, v15.8b\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x23]\n"
- "ld1 { v30.b }[6], [x22]\n"
- "ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v24.16b, v14.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "mov v23.16b, v14.16b\n"
+ "mov v25.16b, v11.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v4.8h, v4.8b, v12.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v31.s }[0], [x24], #0x4\n"
+ "ld1 { v30.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v31.h }[2], [x24], #0x2\n"
+ "ld1 { v30.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[6], [x24]\n"
+ "ld1 { v30.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x23]\n"
- "ld1 { v30.b }[4], [x22]\n"
- "ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[4], [x24]\n"
+ "ld1 { v30.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x23]\n"
- "ld1 { v30.b }[2], [x22]\n"
- "ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v31.h }[0], [x24], #0x2\n"
+ "ld1 { v30.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[2], [x24]\n"
+ "ld1 { v30.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x23]\n"
- "ld1 { v30.b }[0], [x22]\n"
- "ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[0], [x24]\n"
+ "ld1 { v30.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v12.4s, v31.4h, v4.4h\n"
- "smlal2 v17.4s, v31.8h, v4.8h\n"
- "ldr x21, [x13, #0x28]\n"
- "smlal v14.4s, v31.4h, v3.4h\n"
+ "smlal v14.4s, v31.4h, v4.4h\n"
+ "smlal2 v11.4s, v31.8h, v4.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
"smlal2 v9.4s, v31.8h, v3.8h\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "add x21, x21, x15\n"
+ "add x22, x22, x17\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v0.4h\n"
- "smlal2 v26.4s, v31.8h, v0.8h\n"
+ "smlal v24.4s, v31.4h, v1.4h\n"
+ "smlal2 v17.4s, v31.8h, v1.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v25.4s, v31.8h, v0.8h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal v12.4s, v30.4h, v0.4h\n"
- "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal2 v11.4s, v30.8h, v0.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v14.4s, v29.4h, v2.4h\n"
+ "smlal v16.4s, v29.4h, v2.4h\n"
"smlal2 v9.4s, v29.8h, v2.8h\n"
- "smlal v12.4s, v28.4h, v5.4h\n"
- "smlal2 v17.4s, v28.8h, v5.8h\n"
- "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "smlal v16.4s, v28.4h, v4.4h\n"
"smlal2 v9.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "smlal v18.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "tbz x8, #2, 13f\n"
- "ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 12f\n"
- "ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
- "ld1 { v31.b }[6], [x21]\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v17.4s, v28.8h, v2.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v25.4s, v28.8h, v1.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v31.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[6], [x22]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
- "ld1 { v31.b }[4], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[4], [x22]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 14f\n"
- "ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
- "ld1 { v31.b }[2], [x21]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v31.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[2], [x22]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
- "ld1 { v31.b }[0], [x21]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[0], [x22]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v10.4s, v31.8h, v6.8h\n"
- "ldr x20, [x13, #0x30]\n"
- "smlal v12.4s, v27.4h, v7.4h\n"
- "smlal2 v17.4s, v27.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "smlal v14.4s, v27.4h, v6.4h\n"
+ "smlal v24.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v14.4s, v27.4h, v7.4h\n"
+ "smlal2 v11.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v16.4s, v27.4h, v6.4h\n"
"smlal2 v9.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v18.4s, v27.4h, v3.4h\n"
- "smlal2 v26.4s, v27.8h, v3.8h\n"
- "tbz x8, #2, 17f\n"
+ "smlal v24.4s, v27.4h, v4.4h\n"
+ "smlal2 v17.4s, v27.8h, v4.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v25.4s, v27.8h, v3.8h\n"
+ "tbz x7, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 16f\n"
+ "tbz x7, #1, 16f\n"
"ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x7, #0, 19f\n"
"ld1 { v29.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x7, #0, 19f\n"
"ld1 { v29.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 18f\n"
+ "tbz x7, #1, 18f\n"
"ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x7, #0, 19f\n"
"ld1 { v29.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x7, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr x26, [x13, #0x38]\n"
- "smlal v18.4s, v29.4h, v8.4h\n"
- "smlal2 v26.4s, v29.8h, v8.8h\n"
- "add x26, x26, x15\n"
- "tbz x8, #2, 21f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x8, #1, 20f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "ldr x21, [x15, #0x38]\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v25.4s, v29.8h, v8.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v28.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v28.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x8, #1, 22f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x7, #1, 22f\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v28.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v28.b }[0], [x21]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x25, [x13, #0x40]\n"
- "smlal v12.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "smlal v14.4s, v28.4h, v0.4h\n"
+ "ldr x26, [x15, #0x40]\n"
+ "smlal v14.4s, v28.4h, v1.4h\n"
+ "smlal2 v11.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v28.4h, v0.4h\n"
"smlal2 v9.4s, v28.8h, v0.8h\n"
- "add x25, x25, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v31.s }[0], [x25], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v31.h }[2], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[6], [x25]\n"
+ "add x26, x26, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v31.b }[6], [x26]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[4], [x25]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v31.b }[4], [x26]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v31.h }[0], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[2], [x25]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v31.b }[2], [x26]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[0], [x25]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v31.b }[0], [x26]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
- "ldr x19, [x13, #0x48]\n"
- "smlal v12.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "smlal v14.4s, v31.4h, v1.4h\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v14.4s, v31.4h, v2.4h\n"
+ "smlal2 v11.4s, v31.8h, v2.8h\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
"smlal2 v9.4s, v31.8h, v1.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 29f\n"
- "ld1 { v30.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 28f\n"
- "ld1 { v30.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[6], [x19]\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[4], [x19]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x8, #1, 30f\n"
- "ld1 { v30.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[2], [x19]\n"
+ "tbz x7, #1, 30f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[0], [x19]\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr x24, [x13, #0x50]\n"
- "smlal v12.4s, v30.4h, v8.4h\n"
- "smlal2 v17.4s, v30.8h, v8.8h\n"
- "smlal v14.4s, v30.4h, v7.4h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal v14.4s, v30.4h, v8.4h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "smlal v16.4s, v30.4h, v7.4h\n"
"smlal2 v9.4s, v30.8h, v7.8h\n"
- "add x24, x24, x15\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v18.4s, v30.4h, v4.4h\n"
- "smlal2 v26.4s, v30.8h, v4.8h\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "add x25, x25, x17\n"
+ "smlal v24.4s, v30.4h, v5.4h\n"
+ "smlal2 v17.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v25.4s, v30.8h, v4.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr x23, [x13, #0x58]\n"
- "smlal v12.4s, v29.4h, v3.4h\n"
- "smlal2 v17.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v10.4s, v29.8h, v0.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "smlal v14.4s, v29.4h, v3.4h\n"
+ "smlal2 v11.4s, v29.8h, v3.8h\n"
+ "smlal v24.4s, v29.4h, v0.4h\n"
+ "smlal2 v17.4s, v29.8h, v0.8h\n"
+ "add x24, x24, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x22, [x13, #0x60]\n"
- "smlal v14.4s, v28.4h, v5.4h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v16.4s, v28.4h, v5.4h\n"
"smlal2 v9.4s, v28.8h, v5.8h\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v26.4s, v28.8h, v2.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v25.4s, v28.8h, v2.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v31.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v31.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v31.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v31.b }[0], [x23]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
- "ldr x21, [x13, #0x68]\n"
- "smlal v12.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v10.4s, v31.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "smlal v14.4s, v31.4h, v6.4h\n"
+ "smlal2 v11.4s, v31.8h, v6.8h\n"
+ "smlal v24.4s, v31.4h, v3.4h\n"
+ "smlal2 v17.4s, v31.8h, v3.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x7, #1, 46f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr x20, [x13, #0x70]\n"
- "smlal v14.4s, v30.4h, v8.4h\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v16.4s, v30.4h, v8.4h\n"
"smlal2 v9.4s, v30.8h, v8.8h\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal2 v26.4s, v30.8h, v5.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v25.4s, v30.8h, v5.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v29.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v29.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v29.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v29.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr x19, [x13, #0x78]\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v10.4s, v29.8h, v7.8h\n"
- "smlal v18.4s, v29.4h, v6.4h\n"
- "smlal2 v26.4s, v29.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v28.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v28.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[6], [x19]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v24.4s, v29.4h, v7.4h\n"
+ "smlal2 v17.4s, v29.8h, v7.8h\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v25.4s, v29.8h, v6.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[4], [x19]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v28.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[2], [x19]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[0], [x19]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v10.4s, v28.8h, v8.8h\n"
- "smlal v18.4s, v28.4h, v7.4h\n"
- "smlal2 v26.4s, v28.8h, v7.8h\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v21.4s }, [x12], #0x10\n"
- "ld1 { v24.4s }, [x11], #0x10\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v19.d }[0], [x12], #0x8\n"
- "ld1 { v23.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v19.s }[2], [x12]\n"
- "ld1 { v23.s }[2], [x11]\n"
+ "smlal v24.4s, v28.4h, v8.4h\n"
+ "smlal2 v17.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v25.4s, v28.8h, v7.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v22.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v26.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v19.s }[0], [x12]\n"
- "ld1 { v23.s }[0], [x11]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v26.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v21.d }[0], [x12], #0x8\n"
- "ld1 { v24.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[2], [x12]\n"
- "ld1 { v24.s }[2], [x11]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v22.d }[0], [x13], #0x8\n"
+ "ld1 { v10.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v22.s }[2], [x13]\n"
+ "ld1 { v10.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[0], [x12]\n"
- "ld1 { v24.s }[0], [x11]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v22.s }[0], [x13]\n"
+ "ld1 { v10.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v12.4s, v12.4s, v21.4s\n"
- "sqrdmulh v14.4s, v14.4s, v21.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v29.16b, v12.16b, v24.16b\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
- "and v22.16b, v14.16b, v24.16b\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "and v21.16b, v16.16b, v24.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "and v20.16b, v18.16b, v24.16b\n"
- "sqrdmulh v26.4s, v26.4s, v19.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v19.16b, v17.16b, v23.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v30.16b, v9.16b, v23.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v22.4s\n"
+ "and v21.16b, v14.16b, v10.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v10.16b, v23.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v28.16b, v26.16b, v23.16b\n"
- "sqadd v12.4s, v12.4s, v29.4s\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v4.16b, v11.16b, v26.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v22.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v22.4s\n"
+ "sqadd v14.4s, v14.4s, v21.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v16.16b, v10.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "and v3.16b, v24.16b, v10.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "and v21.16b, v23.16b, v10.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v18.4s\n"
+ "sqadd v11.4s, v11.4s, v4.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v22.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v21.4s\n"
+ "and v27.16b, v9.16b, v26.16b\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v20.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v12.4s, v12.4s, v24.4s\n"
- "sqadd v17.4s, v17.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v24.4s\n"
- "sqadd v9.4s, v9.4s, v30.4s\n"
- "srshl v16.4s, v16.4s, v24.4s\n"
- "sqadd v10.4s, v10.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v24.4s\n"
- "sqadd v26.4s, v26.4s, v28.4s\n"
- "srshl v17.4s, v17.4s, v23.4s\n"
- "sqxtn v12.4h, v12.4s\n"
- "srshl v9.4s, v9.4s, v23.4s\n"
+ "and v5.16b, v17.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v4.16b, v25.16b, v26.16b\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v3.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v14.4s, v14.4s, v10.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v25.4s, v25.4s, v4.4s\n"
+ "srshl v11.4s, v11.4s, v26.4s\n"
"sqxtn v14.4h, v14.4s\n"
- "srshl v10.4s, v10.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v26.4s\n"
"sqxtn v16.4h, v16.4s\n"
- "srshl v26.4s, v26.4s, v23.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "sqxtn2 v12.8h, v17.4s\n"
- "sqxtn2 v14.8h, v9.4s\n"
- "sqxtn2 v16.8h, v10.4s\n"
- "sqxtn2 v18.8h, v26.4s\n"
- "sqadd v12.8h, v12.8h, v13.8h\n"
- "sqadd v14.8h, v14.8h, v13.8h\n"
- "sqadd v16.8h, v16.8h, v13.8h\n"
- "sqadd v18.8h, v18.8h, v13.8h\n"
- "smax v12.8h, v12.8h, v11.8h\n"
- "smax v14.8h, v14.8h, v11.8h\n"
- "smax v16.8h, v16.8h, v11.8h\n"
- "smax v18.8h, v18.8h, v11.8h\n"
- "smin v12.8h, v12.8h, v25.8h\n"
- "smin v14.8h, v14.8h, v25.8h\n"
- "smin v16.8h, v16.8h, v25.8h\n"
- "smin v18.8h, v18.8h, v25.8h\n"
- "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "srshl v17.4s, v17.4s, v26.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v25.4s, v25.4s, v26.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v14.8h, v11.4s\n"
+ "sqxtn2 v16.8h, v9.4s\n"
+ "sqxtn2 v24.8h, v17.4s\n"
+ "sqxtn2 v23.8h, v25.4s\n"
+ "sqadd v14.8h, v14.8h, v20.8h\n"
+ "sqadd v16.8h, v16.8h, v20.8h\n"
+ "sqadd v24.8h, v24.8h, v20.8h\n"
+ "sqadd v23.8h, v23.8h, v20.8h\n"
+ "smax v14.8h, v14.8h, v15.8h\n"
+ "smax v16.8h, v16.8h, v15.8h\n"
+ "smax v24.8h, v24.8h, v15.8h\n"
+ "smax v23.8h, v23.8h, v15.8h\n"
+ "smin v14.8h, v14.8h, v13.8h\n"
+ "smin v16.8h, v16.8h, v13.8h\n"
+ "smin v24.8h, v24.8h, v13.8h\n"
+ "smin v23.8h, v23.8h, v13.8h\n"
"uzp1 v14.16b, v14.16b, v14.16b\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "tbz x8, #2, 61f\n"
- "st1 { v12.s }[0], [x10], #0x4\n"
- "st1 { v14.s }[0], [x9], #0x4\n"
- "st1 { v16.s }[0], [x28], #0x4\n"
- "st1 { v18.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 60f\n"
- "st1 { v12.h }[2], [x10], #0x2\n"
- "st1 { v14.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v18.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v12.b }[6], [x10], #0x1\n"
- "st1 { v14.b }[6], [x9], #0x1\n"
- "st1 { v16.b }[6], [x28], #0x1\n"
- "st1 { v18.b }[6], [x27], #0x1\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v14.s }[0], [x11], #0x4\n"
+ "st1 { v16.s }[0], [x10], #0x4\n"
+ "st1 { v24.s }[0], [x9], #0x4\n"
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v14.h }[2], [x11], #0x2\n"
+ "st1 { v16.h }[2], [x10], #0x2\n"
+ "st1 { v24.h }[2], [x9], #0x2\n"
+ "st1 { v23.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v14.b }[6], [x11], #0x1\n"
+ "st1 { v16.b }[6], [x10], #0x1\n"
+ "st1 { v24.b }[6], [x9], #0x1\n"
+ "st1 { v23.b }[6], [x28], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v12.b }[4], [x10], #0x1\n"
- "st1 { v14.b }[4], [x9], #0x1\n"
- "st1 { v16.b }[4], [x28], #0x1\n"
- "st1 { v18.b }[4], [x27], #0x1\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v14.b }[4], [x11], #0x1\n"
+ "st1 { v16.b }[4], [x10], #0x1\n"
+ "st1 { v24.b }[4], [x9], #0x1\n"
+ "st1 { v23.b }[4], [x28], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "st1 { v12.h }[0], [x10], #0x2\n"
- "st1 { v14.h }[0], [x9], #0x2\n"
- "st1 { v16.h }[0], [x28], #0x2\n"
- "st1 { v18.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v12.b }[2], [x10], #0x1\n"
- "st1 { v14.b }[2], [x9], #0x1\n"
- "st1 { v16.b }[2], [x28], #0x1\n"
- "st1 { v18.b }[2], [x27], #0x1\n"
+ "tbz x7, #1, 62f\n"
+ "st1 { v14.h }[0], [x11], #0x2\n"
+ "st1 { v16.h }[0], [x10], #0x2\n"
+ "st1 { v24.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v14.b }[2], [x11], #0x1\n"
+ "st1 { v16.b }[2], [x10], #0x1\n"
+ "st1 { v24.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v12.b }[0], [x10], #0x1\n"
- "st1 { v14.b }[0], [x9], #0x1\n"
- "st1 { v16.b }[0], [x28], #0x1\n"
- "st1 { v18.b }[0], [x27], #0x1\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v14.b }[0], [x11], #0x1\n"
+ "st1 { v16.b }[0], [x10], #0x1\n"
+ "st1 { v24.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 42ff502b0f..a1e5c669b7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,324 +100,324 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x19, %[offsetof_Requantize32_c_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "ld1r { v16.16b }, [x23]\n"
- "ld1r { v12.8h }, [x22]\n"
- "lsr x16, x8, #0x3\n"
- "mov x15, #0x0\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v19.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v12.8h }, [x21]\n"
"ld1r { v14.8h }, [x20]\n"
- "ld1r { v21.8h }, [x19]\n"
- "mov x14, #0x0\n"
- "add x13, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v13.16b, v15.16b\n"
- "ldr q18, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v17.16b, v18.16b\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x17, #0x0\n"
+ "ld1r { v23.8h }, [x20]\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v0.8h, v0.8b, v19.8b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v2.8h, v2.8b, v19.8b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "usubl v3.8h, v3.8b, v19.8b\n"
+ "usubl v4.8h, v4.8b, v19.8b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v6.8h, v6.8b, v19.8b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v19.8b\n"
+ "usubl v8.8h, v8.8b, v19.8b\n"
+ "ldr q15, [x22, #0x0]\n"
+ "ldr q13, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "str x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
"mov v11.16b, v15.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v10.16b, v18.16b\n"
- "mov v23.16b, v15.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v9.16b, v18.16b\n"
- "usubl v0.8h, v0.8b, v16.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "usubl v1.8h, v1.8b, v16.8b\n"
- "usubl v2.8h, v2.8b, v16.8b\n"
- "ldp x26, x25, [x13, #0x0]\n"
- "ldp x24, x23, [x13, #0x10]\n"
- "usubl v3.8h, v3.8b, v16.8b\n"
- "usubl v4.8h, v4.8b, v16.8b\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldp x20, x19, [x13, #0x30]\n"
- "usubl v5.8h, v5.8b, v16.8b\n"
- "usubl v6.8h, v6.8b, v16.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
- "usubl v7.8h, v7.8b, v16.8b\n"
- "usubl v8.8h, v8.8b, v16.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr d31, [x27, x17]\n"
+ "ldr d30, [x26, x17]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr d28, [x24, x17]\n"
"ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d27, [x23, x17]\n"
+ "ldr d26, [x22, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr d24, [x20, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
"ushll v25.8h, v25.8b, #0x0\n"
"ushll v24.8h, v24.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
+ "ldr q18, [x13, #0x0]\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v18.4s, v31.8h, v8.8h\n"
- "ldr x24, [x13, #0x40]\n"
- "ldr x23, [x13, #0x48]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr x21, [x13, #0x50]\n"
- "ldr x19, [x13, #0x58]\n"
+ "smlal2 v13.4s, v31.8h, v8.8h\n"
+ "ldr x23, [x15, #0x40]\n"
+ "smlal v17.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x22, [x15, #0x48]\n"
+ "ldr x21, [x15, #0x50]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v18.4s, v30.8h, v0.8h\n"
- "ldr x22, [x13, #0x78]\n"
- "ldr x20, [x13, #0x60]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v13.4s, v30.8h, v0.8h\n"
+ "ldr q21, [x12, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v18.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v13.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x23, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v17.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v18.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ldr x20, [x15, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v13.4s, v24.4h, v0.4h\n"
- "smlal2 v17.4s, v24.8h, v0.8h\n"
- "ldr x21, [x13, #0x80]\n"
- "ldr x19, [x13, #0x68]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v18.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v13.4s, v29.4h, v4.4h\n"
- "smlal2 v17.4s, v29.8h, v4.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr d29, [x19, x15]\n"
+ "smlal2 v13.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "smlal v17.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v18.4s, v24.8h, v2.8h\n"
- "ldr x19, [x13, #0x70]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v17.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v13.4s, v24.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x22, [x15, #0x88]\n"
"smlal v11.4s, v31.4h, v2.4h\n"
"smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr x24, [x13, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v18.4s, v27.8h, v5.8h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x23, [x13, #0x90]\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v17.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v13.4s, v27.8h, v5.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v17.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
"smlal v11.4s, v26.4h, v3.4h\n"
- "ldr x22, [x13, #0xa8]\n"
- "ldr x19, [x13, #0xa0]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x23, [x15, #0xa8]\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "smlal2 v9.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v4.4h\n"
"smlal v11.4s, v25.4h, v0.4h\n"
- "ldr x21, [x13, #0xb0]\n"
- "ldr x20, [x13, #0xb8]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x15, #0xa0]\n"
"smlal2 v10.4s, v25.8h, v0.8h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x13, #0xc0]\n"
- "ldr q22, [x12, #0x0]\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v13.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
"smlal v11.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
"ushll v25.8h, v25.8b, #0x0\n"
"smlal2 v10.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v9.4s, v28.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal v9.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q31, [x11, #0x0]\n"
- "ldr q19, [x12, #0x10]\n"
- "smlal2 v18.4s, v24.8h, v7.8h\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v13.4s, v24.8h, v7.8h\n"
"smlal v11.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v22.4s\n"
- "ldr q30, [x11, #0x10]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
"smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v9.4s, v26.8h, v5.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
"smlal v11.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v4.16b, v15.16b, v31.16b\n"
- "add x17, x17, #0x48\n"
- "smlal v13.4s, v28.4h, v7.4h\n"
- "smlal2 v17.4s, v28.8h, v7.8h\n"
- "sqrdmulh v18.4s, v18.4s, v19.4s\n"
- "subs x16, x16, #0x1\n"
+ "smlal v9.4s, v24.4h, v3.4h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "add x14, x14, #0x48\n"
+ "smlal v17.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "and v2.16b, v15.16b, v21.16b\n"
+ "subs x8, x8, #0x1\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
"ushll v25.8h, v25.8b, #0x0\n"
"smlal v11.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "add x12, x12, #0x20\n"
- "smlal v13.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal v9.4s, v26.4h, v7.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v30.4s\n"
+ "add x13, x13, #0x20\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
"smlal2 v10.4s, v27.8h, v7.8h\n"
- "smlal2 v9.4s, v26.8h, v7.8h\n"
- "sqrdmulh v13.4s, v13.4s, v22.4s\n"
- "add x15, x15, #0x8\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add x17, x17, #0x8\n"
"smlal v11.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v1.16b, v13.16b, v31.16b\n"
- "add x11, x11, #0x20\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "and v16.16b, v13.16b, v31.16b\n"
+ "add x12, x12, #0x20\n"
"smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal2 v9.4s, v25.8h, v6.8h\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
"smlal v11.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v11.4s, v11.4s, v22.4s\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
"smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal2 v9.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "and v22.16b, v11.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "and v20.16b, v23.16b, v31.16b\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "and v19.16b, v18.16b, v30.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v27.16b, v17.16b, v30.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v25.16b, v10.16b, v30.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v0.16b, v9.16b, v30.16b\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v1.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v22.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v31.4s\n"
- "sqadd v18.4s, v18.4s, v19.4s\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v18.16b, v17.16b, v21.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v28.16b, v11.16b, v21.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v30.4s\n"
+ "and v2.16b, v9.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v20.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v3.16b, v10.16b, v31.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v16.16b, v22.16b, v31.16b\n"
+ "sqadd v17.4s, v17.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v2.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "srshl v17.4s, v17.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "sqadd v10.4s, v10.4s, v3.4s\n"
+ "srshl v9.4s, v9.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v31.4s\n"
- "sqadd v17.4s, v17.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v31.4s\n"
- "sqadd v10.4s, v10.4s, v25.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "srshl v18.4s, v18.4s, v30.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v30.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
"sqxtn v11.4h, v11.4s\n"
- "srshl v9.4s, v9.4s, v30.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v18.4s\n"
- "sqxtn2 v13.8h, v17.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v15.8h, v13.4s\n"
+ "sqxtn2 v17.8h, v20.4s\n"
"sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v23.8h, v9.4s\n"
+ "sqxtn2 v9.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v13.8h, v13.8h, v12.8h\n"
+ "sqadd v17.8h, v17.8h, v12.8h\n"
"sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v23.8h, v23.8h, v12.8h\n"
+ "sqadd v9.8h, v9.8h, v12.8h\n"
"smax v15.8h, v15.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v14.8h\n"
+ "smax v17.8h, v17.8h, v14.8h\n"
"smax v11.8h, v11.8h, v14.8h\n"
- "smax v23.8h, v23.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v21.8h\n"
- "smin v13.8h, v13.8h, v21.8h\n"
- "smin v11.8h, v11.8h, v21.8h\n"
- "smin v23.8h, v23.8h, v21.8h\n"
+ "smax v9.8h, v9.8h, v14.8h\n"
+ "smin v15.8h, v15.8h, v23.8h\n"
+ "smin v17.8h, v17.8h, v23.8h\n"
+ "smin v11.8h, v11.8h, v23.8h\n"
+ "smin v9.8h, v9.8h, v23.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x11, x16]\n"
"uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d13, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d11, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q18, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v13.16b, v15.16b\n"
- "mov v17.16b, v18.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d17, [x10, x16]\n"
+ "str d11, [x9, x16]\n"
+ "str d9, [x28, x16]\n"
+ "ldr q15, [x22, #0x0]\n"
+ "ldr q13, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"mov v11.16b, v15.16b\n"
- "mov v10.16b, v18.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v15.16b\n"
- "mov v9.16b, v18.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "usubl v0.8h, v0.8b, v16.8b\n"
- "usubl v1.8h, v1.8b, v16.8b\n"
- "ldp x26, x25, [x13, #0x0]\n"
- "ldp x24, x23, [x13, #0x10]\n"
- "usubl v2.8h, v2.8b, v16.8b\n"
- "usubl v3.8h, v3.8b, v16.8b\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "ldp x20, x19, [x13, #0x30]\n"
- "usubl v4.8h, v4.8b, v16.8b\n"
- "usubl v5.8h, v5.8b, v16.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
- "usubl v6.8h, v6.8b, v16.8b\n"
- "usubl v7.8h, v7.8b, v16.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
- "usubl v8.8h, v8.8b, v16.8b\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v0.8h, v0.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v2.8h, v2.8b, v19.8b\n"
+ "usubl v3.8h, v3.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d31, [x27, x17]\n"
+ "usubl v4.8h, v4.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d30, [x26, x17]\n"
+ "ldr d29, [x25, x17]\n"
+ "usubl v6.8h, v6.8b, v19.8b\n"
+ "usubl v7.8h, v7.8b, v19.8b\n"
+ "ldr d28, [x24, x17]\n"
+ "ldr d27, [x23, x17]\n"
+ "usubl v8.8h, v8.8b, v19.8b\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr d25, [x21, x17]\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d24, [x20, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
@@ -425,967 +425,967 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ushll v24.8h, v24.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
+ "ldr q18, [x13, #0x0]\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v18.4s, v31.8h, v8.8h\n"
- "ldr x24, [x13, #0x40]\n"
- "ldr x23, [x13, #0x48]\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr x21, [x13, #0x50]\n"
- "ldr x19, [x13, #0x58]\n"
+ "smlal2 v13.4s, v31.8h, v8.8h\n"
+ "ldr x23, [x15, #0x40]\n"
+ "smlal v17.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x22, [x15, #0x48]\n"
+ "ldr x21, [x15, #0x50]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v18.4s, v30.8h, v0.8h\n"
- "ldr x22, [x13, #0x78]\n"
- "ldr x20, [x13, #0x60]\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v13.4s, v30.8h, v0.8h\n"
+ "ldr q21, [x12, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"ushll v28.8h, v28.8b, #0x0\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v18.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v13.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x23, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v17.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v18.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v13.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ldr x20, [x15, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v13.4s, v24.4h, v0.4h\n"
- "smlal2 v17.4s, v24.8h, v0.8h\n"
- "ldr x21, [x13, #0x80]\n"
- "ldr x19, [x13, #0x68]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v18.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v13.4s, v29.4h, v4.4h\n"
- "smlal2 v17.4s, v29.8h, v4.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr d29, [x19, x15]\n"
+ "smlal2 v13.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "smlal v17.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v18.4s, v24.8h, v2.8h\n"
- "ldr x19, [x13, #0x70]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v17.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v13.4s, v24.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x22, [x15, #0x88]\n"
"smlal v11.4s, v31.4h, v2.4h\n"
"smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr x24, [x13, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v18.4s, v27.8h, v5.8h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "ldr x23, [x13, #0x90]\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v17.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v13.4s, v27.8h, v5.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v17.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
"smlal v11.4s, v26.4h, v3.4h\n"
- "ldr x22, [x13, #0xa8]\n"
- "ldr x19, [x13, #0xa0]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x23, [x15, #0xa8]\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "smlal2 v9.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v4.4h\n"
"smlal v11.4s, v25.4h, v0.4h\n"
- "ldr x21, [x13, #0xb0]\n"
- "ldr x20, [x13, #0xb8]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x15, #0xa0]\n"
"smlal2 v10.4s, v25.8h, v0.8h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x13, #0xc0]\n"
- "ldr q22, [x12, #0x0]\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v13.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
"smlal v11.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
"ushll v25.8h, v25.8b, #0x0\n"
"smlal2 v10.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v9.4s, v28.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal v9.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q31, [x11, #0x0]\n"
- "ldr q19, [x12, #0x10]\n"
- "smlal2 v18.4s, v24.8h, v7.8h\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v13.4s, v24.8h, v7.8h\n"
"smlal v11.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v22.4s\n"
- "ldr q30, [x11, #0x10]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
"smlal2 v10.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v9.4s, v26.8h, v5.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "tst x7, #0x7\n"
"smlal v11.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v4.16b, v15.16b, v31.16b\n"
- "tst x8, #0x7\n"
- "smlal v13.4s, v28.4h, v7.4h\n"
- "smlal2 v17.4s, v28.8h, v7.8h\n"
- "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "smlal v9.4s, v24.4h, v3.4h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "add x13, x13, #0x20\n"
+ "smlal v17.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "and v2.16b, v15.16b, v21.16b\n"
"add x12, x12, #0x20\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
"ushll v25.8h, v25.8b, #0x0\n"
"smlal v11.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "add x11, x11, #0x20\n"
- "smlal v13.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal v9.4s, v26.4h, v7.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v30.4s\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"ushll v29.8h, v29.8b, #0x0\n"
"smlal2 v10.4s, v27.8h, v7.8h\n"
- "smlal2 v9.4s, v26.8h, v7.8h\n"
- "sqrdmulh v13.4s, v13.4s, v22.4s\n"
- "add x15, x15, #0x8\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add x17, x17, #0x8\n"
"smlal v11.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v1.16b, v13.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "and v16.16b, v13.16b, v31.16b\n"
"smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal2 v9.4s, v25.8h, v6.8h\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
"smlal v11.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v11.4s, v11.4s, v22.4s\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
"smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal2 v9.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "and v22.16b, v11.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "and v20.16b, v23.16b, v31.16b\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "and v19.16b, v18.16b, v30.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v27.16b, v17.16b, v30.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v25.16b, v10.16b, v30.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v0.16b, v9.16b, v30.16b\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v1.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v22.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v31.4s\n"
- "sqadd v18.4s, v18.4s, v19.4s\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v18.16b, v17.16b, v21.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v28.16b, v11.16b, v21.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v30.4s\n"
+ "and v2.16b, v9.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v20.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v3.16b, v10.16b, v31.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v16.16b, v22.16b, v31.16b\n"
+ "sqadd v17.4s, v17.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v2.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "srshl v17.4s, v17.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "sqadd v10.4s, v10.4s, v3.4s\n"
+ "srshl v9.4s, v9.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v31.4s\n"
- "sqadd v17.4s, v17.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v31.4s\n"
- "sqadd v10.4s, v10.4s, v25.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "srshl v18.4s, v18.4s, v30.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v30.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
"sqxtn v11.4h, v11.4s\n"
- "srshl v9.4s, v9.4s, v30.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v18.4s\n"
- "sqxtn2 v13.8h, v17.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v15.8h, v13.4s\n"
+ "sqxtn2 v17.8h, v20.4s\n"
"sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v23.8h, v9.4s\n"
+ "sqxtn2 v9.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v13.8h, v13.8h, v12.8h\n"
+ "sqadd v17.8h, v17.8h, v12.8h\n"
"sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v23.8h, v23.8h, v12.8h\n"
+ "sqadd v9.8h, v9.8h, v12.8h\n"
"smax v15.8h, v15.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v14.8h\n"
+ "smax v17.8h, v17.8h, v14.8h\n"
"smax v11.8h, v11.8h, v14.8h\n"
- "smax v23.8h, v23.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v21.8h\n"
- "smin v13.8h, v13.8h, v21.8h\n"
- "smin v11.8h, v11.8h, v21.8h\n"
- "smin v23.8h, v23.8h, v21.8h\n"
+ "smax v9.8h, v9.8h, v14.8h\n"
+ "smin v15.8h, v15.8h, v23.8h\n"
+ "smin v17.8h, v17.8h, v23.8h\n"
+ "smin v11.8h, v11.8h, v23.8h\n"
+ "smin v9.8h, v9.8h, v23.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x11, x16]\n"
"uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d13, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d11, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d17, [x10, x16]\n"
+ "str d11, [x9, x16]\n"
+ "str d9, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 88f\n"
- "add x17, x17, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v18.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v18.s }[2], [x19]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v13.d }[0], [x22], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v13.s }[2], [x22]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v18.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v13.s }[0], [x22]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v15.d }[0], [x22], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[2], [x22]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[0], [x22]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "mov v13.16b, v15.16b\n"
- "mov v17.16b, v18.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"mov v11.16b, v15.16b\n"
- "mov v10.16b, v18.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
- "mov v23.16b, v15.16b\n"
- "mov v9.16b, v18.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
- "usubl v0.8h, v0.8b, v16.8b\n"
- "usubl v1.8h, v1.8b, v16.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x26, x25, [x13, #0x0]\n"
- "usubl v2.8h, v2.8b, v16.8b\n"
- "usubl v3.8h, v3.8b, v16.8b\n"
- "ldp x24, x23, [x13, #0x10]\n"
- "ldp x22, x21, [x13, #0x20]\n"
- "usubl v4.8h, v4.8b, v16.8b\n"
- "usubl v5.8h, v5.8b, v16.8b\n"
- "ldp x20, x19, [x13, #0x30]\n"
- "usubl v6.8h, v6.8b, v16.8b\n"
- "usubl v7.8h, v7.8b, v16.8b\n"
- "usubl v8.8h, v8.8b, v16.8b\n"
- "add x26, x26, x15\n"
- "add x25, x25, x15\n"
- "add x24, x24, x15\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x26], #0x4\n"
- "ld1 { v30.s }[0], [x25], #0x4\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v30.b }[6], [x25]\n"
- "ld1 { v29.b }[6], [x24]\n"
- "ld1 { v28.b }[6], [x23]\n"
- "ld1 { v27.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v0.8h, v0.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v2.8h, v2.8b, v19.8b\n"
+ "usubl v3.8h, v3.8b, v19.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v4.8h, v4.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v6.8h, v6.8b, v19.8b\n"
+ "usubl v7.8h, v7.8b, v19.8b\n"
+ "usubl v8.8h, v8.8b, v19.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v30.b }[4], [x25]\n"
- "ld1 { v29.b }[4], [x24]\n"
- "ld1 { v28.b }[4], [x23]\n"
- "ld1 { v27.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x26], #0x2\n"
- "ld1 { v30.h }[0], [x25], #0x2\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v30.b }[2], [x25]\n"
- "ld1 { v29.b }[2], [x24]\n"
- "ld1 { v28.b }[2], [x23]\n"
- "ld1 { v27.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x26]\n"
- "ld1 { v30.b }[0], [x25]\n"
- "ld1 { v29.b }[0], [x24]\n"
- "ld1 { v28.b }[0], [x23]\n"
- "ld1 { v27.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v25.b }[0], [x20]\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v18.4s, v31.8h, v8.8h\n"
- "ldr x24, [x13, #0x40]\n"
+ "smlal2 v13.4s, v31.8h, v8.8h\n"
+ "ldr x23, [x15, #0x40]\n"
"ushll v30.8h, v30.8b, #0x0\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v18.4s, v30.8h, v0.8h\n"
- "add x24, x24, x15\n"
+ "smlal2 v13.4s, v30.8h, v0.8h\n"
+ "add x23, x23, x17\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "smlal v17.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v18.4s, v29.8h, v1.8h\n"
+ "smlal2 v13.4s, v29.8h, v1.8h\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v18.4s, v26.8h, v3.8h\n"
+ "smlal2 v13.4s, v26.8h, v3.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "smlal v13.4s, v27.4h, v2.4h\n"
- "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v18.4s, v25.8h, v4.8h\n"
+ "smlal2 v13.4s, v25.8h, v4.8h\n"
"ushll v24.8h, v24.8b, #0x0\n"
"smlal v11.4s, v31.4h, v2.4h\n"
"smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v9.4s, v31.8h, v0.8h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v18.4s, v24.8h, v2.8h\n"
- "smlal v13.4s, v24.4h, v0.4h\n"
- "smlal2 v17.4s, v24.8h, v0.8h\n"
- "tbz x8, #2, 13f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 12f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "smlal2 v13.4s, v24.8h, v2.8h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v29.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v29.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v29.b }[6], [x23]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v29.b }[4], [x23]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 14f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x7, #1, 14f\n"
+ "ld1 { v29.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v29.b }[2], [x23]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v29.b }[0], [x23]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr x23, [x13, #0x48]\n"
- "smlal v13.4s, v29.4h, v4.4h\n"
- "smlal2 v17.4s, v29.8h, v4.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 17f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 16f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v17.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x8, #1, 18f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x21, [x13, #0x50]\n"
- "smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v17.4s, v28.8h, v5.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 21f\n"
+ "ldr x21, [x15, #0x50]\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 21f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 20f\n"
+ "tbz x7, #1, 20f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x8, #1, 22f\n"
+ "tbz x7, #1, 22f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x19, [x13, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v18.4s, v27.8h, v5.8h\n"
- "smlal v13.4s, v27.4h, v3.4h\n"
- "smlal2 v17.4s, v27.8h, v3.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v26.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v26.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[6], [x19]\n"
+ "smlal2 v13.4s, v27.8h, v5.8h\n"
+ "smlal v17.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[4], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v26.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[2], [x19]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[0], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x13, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"smlal v11.4s, v26.4h, v3.4h\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 29f\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 28f\n"
+ "tbz x7, #1, 28f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 30f\n"
+ "tbz x7, #1, 30f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
"ushll v25.8h, v25.8b, #0x0\n"
- "ldr x19, [x13, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "smlal2 v13.4s, v25.8h, v6.8h\n"
"smlal v11.4s, v25.4h, v0.4h\n"
"smlal2 v10.4s, v25.8h, v0.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr x19, [x13, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"smlal v11.4s, v29.4h, v4.4h\n"
"smlal2 v10.4s, v29.8h, v4.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
"ushll v24.8h, v24.8b, #0x0\n"
- "ldr x22, [x13, #0x78]\n"
+ "ldr x23, [x15, #0x78]\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v18.4s, v24.8h, v7.8h\n"
+ "smlal2 v13.4s, v24.8h, v7.8h\n"
"smlal v11.4s, v24.4h, v1.4h\n"
"smlal2 v10.4s, v24.8h, v1.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x21, [x13, #0x80]\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
+ "ldr x21, [x15, #0x80]\n"
+ "smlal v9.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 45f\n"
"ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
+ "tbz x7, #1, 44f\n"
"ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
+ "tbz x7, #1, 46f\n"
"ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x20, [x13, #0x88]\n"
- "smlal v13.4s, v28.4h, v7.4h\n"
- "smlal2 v17.4s, v28.8h, v7.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v9.4s, v28.8h, v1.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v17.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x23, [x13, #0x90]\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
- "smlal2 v9.4s, v26.8h, v5.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[6], [x23]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "smlal v9.4s, v26.4h, v5.4h\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[6], [x21]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[4], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[4], [x21]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[2], [x23]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[2], [x21]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[0], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[0], [x21]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
"ushll v25.8h, v25.8b, #0x0\n"
- "ldr x24, [x13, #0x98]\n"
+ "ldr x24, [x15, #0x98]\n"
"smlal v11.4s, v25.4h, v6.4h\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "add x24, x24, x15\n"
- "tbz x8, #2, 57f\n"
+ "add x24, x24, x17\n"
+ "tbz x7, #2, 57f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 56f\n"
+ "tbz x7, #1, 56f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
+ "tbz x7, #0, 59f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
+ "tbz x7, #0, 59f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x8, #1, 58f\n"
+ "tbz x7, #1, 58f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
+ "tbz x7, #0, 59f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
+ "tbz x7, #0, 59f\n"
"ld1 { v29.b }[0], [x24]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr x19, [x13, #0xa0]\n"
- "smlal v13.4s, v29.4h, v8.4h\n"
- "smlal2 v17.4s, v29.8h, v8.8h\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 61f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 60f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x7, #1, 62f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x22, [x13, #0xa8]\n"
+ "ldr x23, [x15, #0xa8]\n"
"smlal v11.4s, v27.4h, v7.4h\n"
"smlal2 v10.4s, v27.8h, v7.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 65f\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 64f\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[6], [x22]\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[6], [x23]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[4], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[4], [x23]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 66f\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[2], [x22]\n"
+ "tbz x7, #1, 66f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[2], [x23]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[0], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[0], [x23]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
"ushll v24.8h, v24.8b, #0x0\n"
- "ldr x21, [x13, #0xb0]\n"
+ "ldr x22, [x15, #0xb0]\n"
"smlal v11.4s, v24.4h, v5.4h\n"
"smlal2 v10.4s, v24.8h, v5.8h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 69f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 68f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "smlal v9.4s, v24.4h, v3.4h\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x8, #1, 70f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x7, #1, 70f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
"ushll v26.8h, v26.8b, #0x0\n"
- "ldr x20, [x13, #0xb8]\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "smlal2 v9.4s, v26.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 73f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 72f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal v9.4s, v26.4h, v7.4h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[6], [x21]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[4], [x21]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x8, #1, 74f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x7, #1, 74f\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[2], [x21]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[0], [x21]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
"ushll v25.8h, v25.8b, #0x0\n"
- "ldr x19, [x13, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"smlal v11.4s, v25.4h, v8.4h\n"
"smlal2 v10.4s, v25.8h, v8.8h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "smlal2 v9.4s, v25.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 77f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 76f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x8, #1, 78f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 78f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v9.4s, v29.8h, v8.8h\n"
- "tbz x8, #2, 81f\n"
- "ld1 { v22.4s }, [x12], #0x10\n"
- "ld1 { v31.4s }, [x11], #0x10\n"
- "tbz x8, #1, 80f\n"
- "ld1 { v19.d }[0], [x12], #0x8\n"
- "ld1 { v30.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
- "ld1 { v19.s }[2], [x12]\n"
- "ld1 { v30.s }[2], [x11]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v18.4s }, [x13], #0x10\n"
+ "ld1 { v21.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v30.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 83f\n"
- "ld1 { v19.s }[0], [x12]\n"
- "ld1 { v30.s }[0], [x11]\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 82f\n"
- "ld1 { v22.d }[0], [x12], #0x8\n"
- "ld1 { v31.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
- "ld1 { v22.s }[2], [x12]\n"
- "ld1 { v31.s }[2], [x11]\n"
+ "tbz x7, #1, 82f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v21.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v21.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 83f\n"
- "ld1 { v22.s }[0], [x12]\n"
- "ld1 { v31.s }[0], [x11]\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v21.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v22.4s\n"
- "sqrdmulh v13.4s, v13.4s, v22.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v11.4s, v11.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v22.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v4.16b, v15.16b, v31.16b\n"
- "sqrdmulh v18.4s, v18.4s, v19.4s\n"
- "and v1.16b, v13.16b, v31.16b\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
- "and v22.16b, v11.16b, v31.16b\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "and v20.16b, v23.16b, v31.16b\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "and v2.16b, v15.16b, v21.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v13.4s, v13.4s, v30.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v13.16b, v31.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v18.16b, v17.16b, v21.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v28.16b, v11.16b, v21.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v30.4s\n"
+ "and v2.16b, v9.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v20.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v3.16b, v10.16b, v31.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v16.16b, v22.16b, v31.16b\n"
+ "sqadd v17.4s, v17.4s, v18.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v18.16b, v30.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v27.16b, v17.16b, v30.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v25.16b, v10.16b, v30.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v0.16b, v9.16b, v30.16b\n"
- "sqadd v15.4s, v15.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v1.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v22.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v31.4s\n"
- "sqadd v18.4s, v18.4s, v19.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v2.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "srshl v17.4s, v17.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "sqadd v10.4s, v10.4s, v3.4s\n"
+ "srshl v9.4s, v9.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v31.4s\n"
- "sqadd v17.4s, v17.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v31.4s\n"
- "sqadd v10.4s, v10.4s, v25.4s\n"
- "srshl v23.4s, v23.4s, v31.4s\n"
- "sqadd v9.4s, v9.4s, v0.4s\n"
- "srshl v18.4s, v18.4s, v30.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v10.4s, v10.4s, v30.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
"sqxtn v11.4h, v11.4s\n"
- "srshl v9.4s, v9.4s, v30.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v18.4s\n"
- "sqxtn2 v13.8h, v17.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v15.8h, v13.4s\n"
+ "sqxtn2 v17.8h, v20.4s\n"
"sqxtn2 v11.8h, v10.4s\n"
- "sqxtn2 v23.8h, v9.4s\n"
+ "sqxtn2 v9.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v12.8h\n"
- "sqadd v13.8h, v13.8h, v12.8h\n"
+ "sqadd v17.8h, v17.8h, v12.8h\n"
"sqadd v11.8h, v11.8h, v12.8h\n"
- "sqadd v23.8h, v23.8h, v12.8h\n"
+ "sqadd v9.8h, v9.8h, v12.8h\n"
"smax v15.8h, v15.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v14.8h\n"
+ "smax v17.8h, v17.8h, v14.8h\n"
"smax v11.8h, v11.8h, v14.8h\n"
- "smax v23.8h, v23.8h, v14.8h\n"
- "smin v15.8h, v15.8h, v21.8h\n"
- "smin v13.8h, v13.8h, v21.8h\n"
- "smin v11.8h, v11.8h, v21.8h\n"
- "smin v23.8h, v23.8h, v21.8h\n"
+ "smax v9.8h, v9.8h, v14.8h\n"
+ "smin v15.8h, v15.8h, v23.8h\n"
+ "smin v17.8h, v17.8h, v23.8h\n"
+ "smin v11.8h, v11.8h, v23.8h\n"
+ "smin v9.8h, v9.8h, v23.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "tbz x8, #2, 85f\n"
- "st1 { v15.s }[0], [x10], #0x4\n"
- "st1 { v13.s }[0], [x9], #0x4\n"
- "st1 { v11.s }[0], [x28], #0x4\n"
- "st1 { v23.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 84f\n"
- "st1 { v15.h }[2], [x10], #0x2\n"
- "st1 { v13.h }[2], [x9], #0x2\n"
- "st1 { v11.h }[2], [x28], #0x2\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[6], [x10], #0x1\n"
- "st1 { v13.b }[6], [x9], #0x1\n"
- "st1 { v11.b }[6], [x28], #0x1\n"
- "st1 { v23.b }[6], [x27], #0x1\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v15.s }[0], [x11], #0x4\n"
+ "st1 { v17.s }[0], [x10], #0x4\n"
+ "st1 { v11.s }[0], [x9], #0x4\n"
+ "st1 { v9.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "st1 { v17.h }[2], [x10], #0x2\n"
+ "st1 { v11.h }[2], [x9], #0x2\n"
+ "st1 { v9.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[6], [x11], #0x1\n"
+ "st1 { v17.b }[6], [x10], #0x1\n"
+ "st1 { v11.b }[6], [x9], #0x1\n"
+ "st1 { v9.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[4], [x10], #0x1\n"
- "st1 { v13.b }[4], [x9], #0x1\n"
- "st1 { v11.b }[4], [x28], #0x1\n"
- "st1 { v23.b }[4], [x27], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[4], [x11], #0x1\n"
+ "st1 { v17.b }[4], [x10], #0x1\n"
+ "st1 { v11.b }[4], [x9], #0x1\n"
+ "st1 { v9.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 86f\n"
- "st1 { v15.h }[0], [x10], #0x2\n"
- "st1 { v13.h }[0], [x9], #0x2\n"
- "st1 { v11.h }[0], [x28], #0x2\n"
- "st1 { v23.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[2], [x10], #0x1\n"
- "st1 { v13.b }[2], [x9], #0x1\n"
- "st1 { v11.b }[2], [x28], #0x1\n"
- "st1 { v23.b }[2], [x27], #0x1\n"
+ "tbz x7, #1, 86f\n"
+ "st1 { v15.h }[0], [x11], #0x2\n"
+ "st1 { v17.h }[0], [x10], #0x2\n"
+ "st1 { v11.h }[0], [x9], #0x2\n"
+ "st1 { v9.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[2], [x11], #0x1\n"
+ "st1 { v17.b }[2], [x10], #0x1\n"
+ "st1 { v11.b }[2], [x9], #0x1\n"
+ "st1 { v9.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[0], [x10], #0x1\n"
- "st1 { v13.b }[0], [x9], #0x1\n"
- "st1 { v11.b }[0], [x28], #0x1\n"
- "st1 { v23.b }[0], [x27], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[0], [x11], #0x1\n"
+ "st1 { v17.b }[0], [x10], #0x1\n"
+ "st1 { v11.b }[0], [x9], #0x1\n"
+ "st1 { v9.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 9ac7173b4c..eec3ba5900 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,1255 +111,1255 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x16, [%x[params], %[offsetof_Params_requant]]\n"
"ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x9, x16, %[offsetof_Requantize32_b_offset]\n"
- "add x19, x16, %[offsetof_Requantize32_c_offset]\n"
- "ldr x10, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x24, x16, %[offsetof_Requantize32_minval]\n"
- "add x2, x16, %[offsetof_Requantize32_maxval]\n"
- "ldr x8, [%x[params], %[offsetof_Params_weights]]\n"
- "ld1r { v15.16b }, [x9]\n"
- "ld1r { v16.8h }, [x19]\n"
- "lsr x3, x4, #0x3\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x9, x4, #0x3\n"
+ "add x24, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v9.16b }, [x24]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x28, x22, %[offsetof_Requantize32_c_offset]\n"
+ "add x24, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.8h }, [x28]\n"
+ "ld1r { v14.8h }, [x24]\n"
+ "add x20, x22, %[offsetof_Requantize32_maxval]\n"
+ "mov x3, #0x0\n"
+ "ld1r { v12.8h }, [x20]\n"
"mov x1, #0x0\n"
- "ld1r { v12.8h }, [x24]\n"
- "ld1r { v13.8h }, [x2]\n"
- "mov x2, #0x0\n"
- "add x0, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x21, x15, [x10, #0x0]\n"
- "ldp x17, x16, [x10, #0x10]\n"
- "cbz x3, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q11, [x19, #0x0]\n"
- "subs x3, x3, #0x1\n"
- "mov v14.16b, v11.16b\n"
- "ldr q21, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x8, #0x0]\n"
- "ldr d1, [x8, #0x8]\n"
- "ldr d2, [x8, #0x10]\n"
- "mov v10.16b, v21.16b\n"
- "mov v9.16b, v11.16b\n"
- "ldr d3, [x8, #0x18]\n"
- "ldr d4, [x8, #0x20]\n"
- "mov v8.16b, v21.16b\n"
- "mov v7.16b, v11.16b\n"
- "ldp x28, x27, [x0, #0x0]\n"
- "ldp x10, x26, [x0, #0x10]\n"
- "mov v6.16b, v21.16b\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldp x24, x23, [x0, #0x20]\n"
- "ldp x22, x25, [x0, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldp x20, x19, [x0, #0x40]\n"
- "ldr d31, [x28, x1]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr d30, [x27, x1]\n"
- "ldr d29, [x10, x1]\n"
+ "add x2, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x0, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x7, x8, [x25, #0x0]\n"
+ "ldp x17, x16, [x25, #0x10]\n"
+ "cbz x9, 3f\n"
+ "ldr d0, [x0, #0x0]\n"
+ "ldr d1, [x0, #0x8]\n"
+ "subs x9, x9, #0x1\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "ldr d2, [x0, #0x10]\n"
+ "ldr d3, [x0, #0x18]\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d4, [x0, #0x20]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr q11, [x13, #0x0]\n"
+ "ldr q13, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x10, x28, [x2, #0x0]\n"
+ "ldp x27, x26, [x2, #0x10]\n"
+ "mov v20.16b, v11.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "ldp x25, x24, [x2, #0x20]\n"
+ "ldp x23, x22, [x2, #0x30]\n"
+ "mov v8.16b, v11.16b\n"
+ "mov v7.16b, v13.16b\n"
+ "ldp x21, x20, [x2, #0x40]\n"
+ "ldr d31, [x10, x3]\n"
+ "mov v6.16b, v11.16b\n"
+ "mov v5.16b, v13.16b\n"
+ "ldr d30, [x28, x3]\n"
+ "ldr d29, [x27, x3]\n"
"ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr d28, [x26, x1]\n"
- "ldr d27, [x24, x1]\n"
+ "ldr d28, [x26, x3]\n"
+ "ldr d27, [x25, x3]\n"
"ushll v29.8h, v29.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr d23, [x23, x1]\n"
- "ldr d25, [x22, x1]\n"
+ "ldr d23, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
"ushll v27.8h, v27.8b, #0x0\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "ldr d24, [x25, x1]\n"
- "ldr d26, [x20, x1]\n"
+ "ldr d24, [x22, x3]\n"
+ "ldr d26, [x21, x3]\n"
"ushll v25.8h, v25.8b, #0x0\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "ldr d22, [x19, x1]\n"
+ "ldr d22, [x20, x3]\n"
"ushll v26.8h, v26.8b, #0x0\n"
"ushll v22.8h, v22.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
+ "ldr q18, [x6, #0x0]\n"
+ "ldr q21, [x5, #0x0]\n"
"smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "ldr x19, [x0, #0x50]\n"
- "ldr d31, [x19, x1]\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal v9.4s, v29.4h, v0.4h\n"
- "ldr x20, [x0, #0x58]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v7.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x19, [x0, #0x60]\n"
- "ldr x24, [x0, #0x68]\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "smlal2 v13.4s, v31.8h, v0.8h\n"
+ "ldr q16, [x6, #0x10]\n"
+ "ldr q10, [x5, #0x10]\n"
"smlal v11.4s, v30.4h, v1.4h\n"
- "ldr x23, [x0, #0x70]\n"
- "ldr x26, [x0, #0x78]\n"
- "smlal2 v21.4s, v30.8h, v1.8h\n"
- "smlal2 v6.4s, v28.8h, v0.8h\n"
- "ldr d30, [x20, x1]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v1.4h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "ldr d0, [x8, #0x28]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v7.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr x7, [x0, #0x80]\n"
- "ldr x22, [x0, #0x88]\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "ldr x20, [x2, #0x50]\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x2, #0x58]\n"
+ "smlal2 v13.4s, v30.8h, v1.8h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x3]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v11.4s, v27.4h, v2.4h\n"
- "ldr x20, [x0, #0x90]\n"
- "ldr x14, [x0, #0x98]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "smlal2 v6.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x1]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v14.4s, v25.4h, v2.4h\n"
- "smlal v9.4s, v23.4h, v2.4h\n"
- "ldr d1, [x8, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr x19, [x0, #0xa0]\n"
- "ldr x13, [x0, #0xa8]\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "ldr x21, [x2, #0x60]\n"
+ "ldr x20, [x2, #0x68]\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x3]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x2, #0x70]\n"
+ "ldr x26, [x2, #0x78]\n"
+ "smlal2 v13.4s, v27.8h, v2.8h\n"
+ "smlal2 v19.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x0, #0x28]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v11.4s, v25.4h, v3.4h\n"
- "ldr x12, [x0, #0xb0]\n"
- "ldr x11, [x0, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v3.8h\n"
- "smlal2 v6.4s, v31.8h, v2.8h\n"
- "ldr d25, [x24, x1]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v14.4s, v24.4h, v3.4h\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "ldr d2, [x8, #0x38]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr x10, [x0, #0xc0]\n"
- "ldr x9, [x0, #0xc8]\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "ldr x28, [x0, #0xd0]\n"
- "ldr x27, [x0, #0xd8]\n"
- "smlal2 v21.4s, v24.8h, v4.8h\n"
- "smlal2 v6.4s, v30.8h, v3.8h\n"
- "ldr d24, [x23, x1]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
- "ldr d3, [x8, #0x40]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x26, x1]\n"
+ "ldr x23, [x2, #0x80]\n"
+ "ldr x24, [x2, #0x88]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x3]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "ldr x15, [x2, #0x90]\n"
+ "ldr x21, [x2, #0x98]\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "smlal2 v19.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x0, #0x30]\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "ldr x14, [x2, #0xa0]\n"
+ "ldr x13, [x2, #0xa8]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x3]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "ldr x12, [x2, #0xb0]\n"
+ "ldr x20, [x2, #0xb8]\n"
+ "smlal2 v13.4s, v24.8h, v4.8h\n"
+ "smlal2 v19.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x0, #0x38]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x26, [x0, #0xe0]\n"
- "ldr x25, [x0, #0xe8]\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "smlal2 v6.4s, v26.8h, v4.8h\n"
- "ldr d4, [x8, #0x48]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v14.4s, v28.4h, v0.4h\n"
- "smlal v9.4s, v22.4h, v0.4h\n"
- "ldr x24, [x0, #0xf0]\n"
- "ldr q17, [x5, #0x0]\n"
- "smlal v7.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr q5, [x6, #0x0]\n"
- "ldr q18, [x5, #0x10]\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "ldr x11, [x2, #0xc0]\n"
+ "ldr x10, [x2, #0xc8]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x25, x3]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "ldr x22, [x2, #0xd0]\n"
+ "ldr x28, [x2, #0xd8]\n"
+ "smlal2 v13.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x0, #0x40]\n"
+ "smlal2 v19.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x26, x3]\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v11.4s, v28.4h, v1.4h\n"
- "ldr q29, [x6, #0x10]\n"
- "subs x3, x3, #0x1\n"
- "smlal2 v21.4s, v28.8h, v1.8h\n"
- "smlal2 v6.4s, v25.8h, v0.8h\n"
- "ldr d28, [x22, x1]\n"
- "ldr d0, [x8, #0x50]\n"
- "smlal v14.4s, v23.4h, v1.4h\n"
- "smlal v9.4s, v25.4h, v1.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x27, [x2, #0xe0]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x0, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x26, [x2, #0xe8]\n"
+ "smlal2 v13.4s, v28.8h, v1.8h\n"
+ "smlal2 v19.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x3]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x23, [x0, #0xf8]\n"
- "smlal v7.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "add x5, x5, #0x20\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v11.4s, v23.4h, v2.4h\n"
+ "ldr x25, [x2, #0xf0]\n"
+ "subs x9, x9, #0x1\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x0, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
"add x6, x6, #0x20\n"
- "smlal2 v21.4s, v23.8h, v2.8h\n"
- "ldr d23, [x7, x1]\n"
- "smlal2 v6.4s, v24.8h, v1.8h\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v13.4s, v23.8h, v2.8h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x3]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal v9.4s, v24.4h, v2.4h\n"
- "ldr d1, [x8, #0x58]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr x22, [x0, #0x100]\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal2 v6.4s, v27.8h, v2.8h\n"
- "ldr d31, [x20, x1]\n"
+ "ldr x24, [x2, #0xf8]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x0, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v13.4s, v31.8h, v3.8h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x3]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v14.4s, v30.4h, v3.4h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "ldr d2, [x8, #0x60]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr x7, [x0, #0x108]\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v11.4s, v30.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v4.8h\n"
- "ldr d30, [x14, x1]\n"
- "smlal2 v6.4s, v23.8h, v3.8h\n"
+ "ldr x23, [x2, #0x100]\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x0, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v13.4s, v30.8h, v4.8h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x3]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v14.4s, v26.4h, v4.4h\n"
- "smlal v9.4s, v23.4h, v4.4h\n"
- "ldr d3, [x8, #0x68]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x19, x1]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"smlal v11.4s, v22.4h, v0.4h\n"
- "ldr x20, [x0, #0x110]\n"
- "ldr x19, [x0, #0x118]\n"
- "smlal2 v21.4s, v22.8h, v0.8h\n"
- "smlal2 v6.4s, v28.8h, v4.8h\n"
- "ldr d4, [x8, #0x70]\n"
- "ldr d22, [x11, x1]\n"
- "smlal v14.4s, v25.4h, v0.4h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v7.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "ldr x15, [x2, #0x108]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x0, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v13.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x3]\n"
+ "smlal2 v19.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x3]\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v21.4s, v25.8h, v1.8h\n"
- "ldr d25, [x13, x1]\n"
- "smlal2 v6.4s, v30.8h, v0.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x21, [x2, #0x110]\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x0, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "ldr x20, [x2, #0x118]\n"
+ "smlal2 v13.4s, v25.8h, v1.8h\n"
+ "smlal2 v19.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x3]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "smlal v14.4s, v24.4h, v1.4h\n"
- "smlal v9.4s, v30.4h, v1.4h\n"
- "ldr d0, [x8, #0x78]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v7.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v24.8h, v2.8h\n"
- "ldr d24, [x12, x1]\n"
- "smlal2 v6.4s, v26.8h, v1.8h\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x0, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v13.4s, v24.8h, v2.8h\n"
+ "smlal2 v19.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x3]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v2.4h\n"
- "smlal v9.4s, v26.4h, v2.4h\n"
- "ldr d1, [x8, #0x80]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v27.8h, v3.8h\n"
- "smlal2 v6.4s, v25.8h, v2.8h\n"
- "ldr d27, [x10, x1]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x0, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v13.4s, v27.8h, v3.8h\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x3]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v14.4s, v23.4h, v3.4h\n"
- "smlal v9.4s, v25.4h, v3.4h\n"
- "ldr d2, [x8, #0x88]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x1]\n"
- "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x0, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v13.4s, v23.8h, v4.8h\n"
+ "smlal2 v19.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x10, x3]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal v14.4s, v28.4h, v4.4h\n"
- "smlal v9.4s, v24.4h, v4.4h\n"
- "ldr d3, [x8, #0x90]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x26, x1]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "ldr d31, [x28, x1]\n"
- "smlal2 v6.4s, v22.8h, v4.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal v9.4s, v27.4h, v0.4h\n"
- "ldr d4, [x8, #0x98]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v7.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x0, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v13.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x22, x3]\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x3]\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v21.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x1]\n"
- "smlal2 v6.4s, v23.8h, v0.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x0, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v13.4s, v30.8h, v1.8h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x3]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v14.4s, v26.4h, v1.4h\n"
- "smlal v9.4s, v23.4h, v1.4h\n"
- "ldr d0, [x8, #0xa0]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v7.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v26.8h, v2.8h\n"
- "smlal2 v6.4s, v31.8h, v1.8h\n"
- "ldr d26, [x25, x1]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x0, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v13.4s, v26.8h, v2.8h\n"
+ "smlal2 v19.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x3]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v14.4s, v25.4h, v2.4h\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "ldr d1, [x8, #0xa8]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v25.8h, v3.8h\n"
- "smlal2 v6.4s, v30.8h, v2.8h\n"
- "ldr d25, [x24, x1]\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x0, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "smlal2 v19.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x3]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "smlal v14.4s, v24.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v3.4h\n"
- "ldr d2, [x8, #0xb0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x1]\n"
- "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x0, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v13.4s, v24.8h, v4.8h\n"
+ "smlal2 v19.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x3]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal v14.4s, v22.4h, v4.4h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "ldr d3, [x8, #0xb8]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x1]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x0, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v13.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x3]\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal2 v6.4s, v26.8h, v4.8h\n"
- "ldr d4, [x8, #0xc0]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v14.4s, v23.4h, v0.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "add x8, x8, #0xc8\n"
- "smlal v7.4s, v24.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "ldr d25, [x7, x1]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v6.4s, v24.8h, v0.8h\n"
"smlal v11.4s, v23.4h, v1.4h\n"
- "smlal v14.4s, v31.4h, v1.4h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal v7.4s, v27.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "ldr d24, [x20, x1]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v6.4s, v27.8h, v1.8h\n"
+ "smlal2 v19.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x0, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "add x0, x0, #0xc8\n"
+ "smlal2 v13.4s, v23.8h, v1.8h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x15, x3]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
"smlal v11.4s, v31.4h, v2.4h\n"
- "smlal v14.4s, v30.4h, v2.4h\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "ldr d27, [x19, x1]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal2 v6.4s, v25.8h, v2.8h\n"
- "add x1, x1, #0x8\n"
+ "smlal2 v19.4s, v23.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v13.4s, v31.8h, v2.8h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x21, x3]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
"smlal v11.4s, v30.4h, v3.4h\n"
- "smlal v14.4s, v28.4h, v3.4h\n"
- "smlal v9.4s, v25.4h, v3.4h\n"
- "smlal v7.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "smlal2 v19.4s, v31.8h, v1.8h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v13.4s, v30.8h, v3.8h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x20, x3]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
"smlal v11.4s, v28.4h, v4.4h\n"
- "smlal v14.4s, v26.4h, v4.4h\n"
- "sqrdmulh v11.4s, v11.4s, v17.4s\n"
- "smlal v9.4s, v24.4h, v4.4h\n"
- "smlal v7.4s, v27.4h, v4.4h\n"
- "sqrdmulh v14.4s, v14.4s, v17.4s\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal2 v6.4s, v27.8h, v4.8h\n"
- "sqrdmulh v7.4s, v7.4s, v17.4s\n"
- "and v23.16b, v11.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v18.4s\n"
- "and v22.16b, v14.16b, v5.16b\n"
- "sqrdmulh v10.4s, v10.4s, v18.4s\n"
- "and v17.16b, v9.16b, v5.16b\n"
+ "smlal2 v19.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v31.16b, v11.16b, v21.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "smlal2 v13.4s, v28.8h, v4.8h\n"
+ "smlal2 v19.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v16.4s\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "and v17.16b, v13.16b, v10.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "smlal2 v19.4s, v26.8h, v4.8h\n"
"sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "and v20.16b, v7.16b, v5.16b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
"sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v19.16b, v21.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v18.16b, v10.16b, v29.16b\n"
+ "sqadd v11.4s, v11.4s, v31.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v8.16b, v29.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v4.16b, v6.16b, v29.16b\n"
- "sqadd v11.4s, v11.4s, v23.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v22.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
+ "and v26.16b, v20.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "and v18.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v16.4s\n"
+ "and v31.16b, v6.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v17.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v7.4s, v7.4s, v20.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v5.4s\n"
- "sqadd v8.4s, v8.4s, v26.4s\n"
- "srshl v7.4s, v7.4s, v5.4s\n"
- "sqadd v6.4s, v6.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v29.4s\n"
+ "and v27.16b, v19.16b, v10.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v25.16b, v7.16b, v10.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v17.16b, v5.16b, v10.16b\n"
+ "sqadd v20.4s, v20.4s, v26.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v31.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v27.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "sqadd v7.4s, v7.4s, v25.4s\n"
+ "srshl v6.4s, v6.4s, v21.4s\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
"sqxtn v11.4h, v11.4s\n"
- "srshl v10.4s, v10.4s, v29.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v6.4s, v6.4s, v29.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "sqxtn2 v11.8h, v21.4s\n"
- "sqxtn2 v14.8h, v10.4s\n"
- "sqxtn2 v9.8h, v8.4s\n"
- "sqxtn2 v7.8h, v6.4s\n"
- "sqadd v11.8h, v11.8h, v16.8h\n"
- "sqadd v14.8h, v14.8h, v16.8h\n"
- "sqadd v9.8h, v9.8h, v16.8h\n"
- "sqadd v7.8h, v7.8h, v16.8h\n"
- "smax v11.8h, v11.8h, v12.8h\n"
- "smax v14.8h, v14.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v7.8h, v7.8h, v12.8h\n"
- "smin v11.8h, v11.8h, v13.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v9.8h, v9.8h, v13.8h\n"
- "smin v7.8h, v7.8h, v13.8h\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v5.4s, v5.4s, v10.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v11.8h, v13.4s\n"
+ "sqxtn2 v20.8h, v19.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v11.8h, v11.8h, v15.8h\n"
+ "sqadd v20.8h, v20.8h, v15.8h\n"
+ "sqadd v8.8h, v8.8h, v15.8h\n"
+ "sqadd v6.8h, v6.8h, v15.8h\n"
+ "smax v11.8h, v11.8h, v14.8h\n"
+ "smax v20.8h, v20.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v11.8h, v11.8h, v12.8h\n"
+ "smin v20.8h, v20.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
"uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d11, [x21, x2]\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d14, [x15, x2]\n"
- "str d9, [x17, x2]\n"
- "str d7, [x16, x2]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q11, [x19, #0x0]\n"
- "add x2, x2, #0x8\n"
- "ldr q21, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x8, #0x0]\n"
- "ldr d1, [x8, #0x8]\n"
- "ldr d2, [x8, #0x10]\n"
- "mov v14.16b, v11.16b\n"
- "mov v10.16b, v21.16b\n"
- "ldr d3, [x8, #0x18]\n"
- "ldr d4, [x8, #0x20]\n"
- "mov v9.16b, v11.16b\n"
- "mov v8.16b, v21.16b\n"
- "ldp x28, x27, [x0, #0x0]\n"
- "ldp x10, x26, [x0, #0x10]\n"
- "mov v7.16b, v11.16b\n"
- "mov v6.16b, v21.16b\n"
- "ldp x24, x23, [x0, #0x20]\n"
- "ldp x22, x25, [x0, #0x30]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x20, x19, [x0, #0x40]\n"
- "ldr d31, [x28, x1]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr d30, [x27, x1]\n"
- "ldr d29, [x10, x1]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d11, [x7, x1]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d20, [x8, x1]\n"
+ "str d8, [x17, x1]\n"
+ "str d6, [x16, x1]\n"
+ "ldr q11, [x13, #0x0]\n"
+ "ldr q13, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "ldr d0, [x0, #0x0]\n"
+ "ldr d1, [x0, #0x8]\n"
+ "add x1, x1, #0x8\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x0, #0x10]\n"
+ "ldr d3, [x0, #0x18]\n"
+ "mov v20.16b, v11.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr d4, [x0, #0x20]\n"
+ "ldp x10, x28, [x2, #0x0]\n"
+ "mov v8.16b, v11.16b\n"
+ "mov v7.16b, v13.16b\n"
+ "ldp x27, x26, [x2, #0x10]\n"
+ "ldp x25, x24, [x2, #0x20]\n"
+ "mov v6.16b, v11.16b\n"
+ "mov v5.16b, v13.16b\n"
+ "ldp x23, x22, [x2, #0x30]\n"
+ "ldp x21, x20, [x2, #0x40]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldr d31, [x10, x3]\n"
+ "ldr d30, [x28, x3]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d29, [x27, x3]\n"
+ "ldr d28, [x26, x3]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "ldr d28, [x26, x1]\n"
- "ldr d27, [x24, x1]\n"
+ "ldr d27, [x25, x3]\n"
+ "ldr d23, [x24, x3]\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "ldr d23, [x23, x1]\n"
- "ldr d25, [x22, x1]\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d24, [x22, x3]\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr d24, [x25, x1]\n"
- "ldr d26, [x20, x1]\n"
+ "ldr d26, [x21, x3]\n"
+ "ldr d22, [x20, x3]\n"
"ushll v23.8h, v23.8b, #0x0\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "ldr d22, [x19, x1]\n"
"ushll v24.8h, v24.8b, #0x0\n"
"ushll v26.8h, v26.8b, #0x0\n"
"ushll v22.8h, v22.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
+ "ldr q18, [x6, #0x0]\n"
+ "ldr q21, [x5, #0x0]\n"
"smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "ldr x19, [x0, #0x50]\n"
- "ldr d31, [x19, x1]\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal v9.4s, v29.4h, v0.4h\n"
- "ldr x20, [x0, #0x58]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v7.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x19, [x0, #0x60]\n"
- "ldr x24, [x0, #0x68]\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "smlal2 v13.4s, v31.8h, v0.8h\n"
+ "ldr q16, [x6, #0x10]\n"
+ "ldr q10, [x5, #0x10]\n"
"smlal v11.4s, v30.4h, v1.4h\n"
- "ldr x23, [x0, #0x70]\n"
- "ldr x26, [x0, #0x78]\n"
- "smlal2 v21.4s, v30.8h, v1.8h\n"
- "smlal2 v6.4s, v28.8h, v0.8h\n"
- "ldr d30, [x20, x1]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v1.4h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "ldr d0, [x8, #0x28]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v7.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "ldr x7, [x0, #0x80]\n"
- "ldr x22, [x0, #0x88]\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "ldr x20, [x2, #0x50]\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x2, #0x58]\n"
+ "smlal2 v13.4s, v30.8h, v1.8h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x3]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v11.4s, v27.4h, v2.4h\n"
- "ldr x20, [x0, #0x90]\n"
- "ldr x14, [x0, #0x98]\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "smlal2 v6.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x1]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v14.4s, v25.4h, v2.4h\n"
- "smlal v9.4s, v23.4h, v2.4h\n"
- "ldr d1, [x8, #0x30]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "ldr x19, [x0, #0xa0]\n"
- "ldr x13, [x0, #0xa8]\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "ldr x21, [x2, #0x60]\n"
+ "ldr x20, [x2, #0x68]\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x3]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x2, #0x70]\n"
+ "ldr x26, [x2, #0x78]\n"
+ "smlal2 v13.4s, v27.8h, v2.8h\n"
+ "smlal2 v19.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x0, #0x28]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v11.4s, v25.4h, v3.4h\n"
- "ldr x12, [x0, #0xb0]\n"
- "ldr x11, [x0, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v3.8h\n"
- "smlal2 v6.4s, v31.8h, v2.8h\n"
- "ldr d25, [x24, x1]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal v14.4s, v24.4h, v3.4h\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "ldr d2, [x8, #0x38]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "ldr x10, [x0, #0xc0]\n"
- "ldr x9, [x0, #0xc8]\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "ldr x28, [x0, #0xd0]\n"
- "ldr x27, [x0, #0xd8]\n"
- "smlal2 v21.4s, v24.8h, v4.8h\n"
- "smlal2 v6.4s, v30.8h, v3.8h\n"
- "ldr d24, [x23, x1]\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
- "ldr d3, [x8, #0x40]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "ldr d27, [x26, x1]\n"
+ "ldr x23, [x2, #0x80]\n"
+ "ldr x24, [x2, #0x88]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x3]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "ldr x15, [x2, #0x90]\n"
+ "ldr x21, [x2, #0x98]\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "smlal2 v19.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x0, #0x30]\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "ldr x14, [x2, #0xa0]\n"
+ "ldr x13, [x2, #0xa8]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x3]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "ldr x12, [x2, #0xb0]\n"
+ "ldr x20, [x2, #0xb8]\n"
+ "smlal2 v13.4s, v24.8h, v4.8h\n"
+ "smlal2 v19.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x0, #0x38]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x26, [x0, #0xe0]\n"
- "ldr x25, [x0, #0xe8]\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "smlal2 v6.4s, v26.8h, v4.8h\n"
- "ldr d4, [x8, #0x48]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v14.4s, v28.4h, v0.4h\n"
- "smlal v9.4s, v22.4h, v0.4h\n"
- "ldr x24, [x0, #0xf0]\n"
- "ldr x23, [x0, #0xf8]\n"
- "smlal v7.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "ldr q17, [x5, #0x0]\n"
- "ldr q5, [x6, #0x0]\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "ldr x11, [x2, #0xc0]\n"
+ "ldr x10, [x2, #0xc8]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x25, x3]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "ldr x22, [x2, #0xd0]\n"
+ "ldr x28, [x2, #0xd8]\n"
+ "smlal2 v13.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x0, #0x40]\n"
+ "smlal2 v19.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x26, x3]\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v11.4s, v28.4h, v1.4h\n"
- "ldr q18, [x5, #0x10]\n"
- "ldr q29, [x6, #0x10]\n"
- "smlal2 v21.4s, v28.8h, v1.8h\n"
- "smlal2 v6.4s, v25.8h, v0.8h\n"
- "ldr d28, [x22, x1]\n"
- "ldr d0, [x8, #0x50]\n"
- "smlal v14.4s, v23.4h, v1.4h\n"
- "smlal v9.4s, v25.4h, v1.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x27, [x2, #0xe0]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x0, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x26, [x2, #0xe8]\n"
+ "smlal2 v13.4s, v28.8h, v1.8h\n"
+ "smlal2 v19.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x3]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ldr x22, [x0, #0x100]\n"
- "smlal v7.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "tst x4, #0x7\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v11.4s, v23.4h, v2.4h\n"
- "add x5, x5, #0x20\n"
+ "ldr x25, [x2, #0xf0]\n"
+ "ldr x24, [x2, #0xf8]\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x0, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "tst x4, #0x7\n"
"add x6, x6, #0x20\n"
- "smlal2 v21.4s, v23.8h, v2.8h\n"
- "ldr d23, [x7, x1]\n"
- "smlal2 v6.4s, v24.8h, v1.8h\n"
+ "smlal2 v13.4s, v23.8h, v2.8h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x3]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal v9.4s, v24.4h, v2.4h\n"
- "ldr d1, [x8, #0x58]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "ldr x7, [x0, #0x108]\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal2 v6.4s, v27.8h, v2.8h\n"
- "ldr d31, [x20, x1]\n"
+ "ldr x23, [x2, #0x100]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x0, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v13.4s, v31.8h, v3.8h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x3]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "smlal v14.4s, v30.4h, v3.4h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "ldr d2, [x8, #0x60]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "ldr x20, [x0, #0x110]\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v11.4s, v30.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v4.8h\n"
- "ldr d30, [x14, x1]\n"
- "smlal2 v6.4s, v23.8h, v3.8h\n"
+ "ldr x15, [x2, #0x108]\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x0, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v13.4s, v30.8h, v4.8h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x3]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v14.4s, v26.4h, v4.4h\n"
- "smlal v9.4s, v23.4h, v4.4h\n"
- "ldr d3, [x8, #0x68]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "ldr d26, [x19, x1]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"smlal v11.4s, v22.4h, v0.4h\n"
- "ldr x19, [x0, #0x118]\n"
- "smlal2 v21.4s, v22.8h, v0.8h\n"
- "smlal2 v6.4s, v28.8h, v4.8h\n"
- "ldr d4, [x8, #0x70]\n"
- "ldr d22, [x11, x1]\n"
- "smlal v14.4s, v25.4h, v0.4h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v7.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
- "ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "ldr x21, [x2, #0x110]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x0, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v13.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x3]\n"
+ "smlal2 v19.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x3]\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v21.4s, v25.8h, v1.8h\n"
- "ldr d25, [x13, x1]\n"
- "smlal2 v6.4s, v30.8h, v0.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x2, #0x118]\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x0, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "smlal2 v13.4s, v25.8h, v1.8h\n"
+ "smlal2 v19.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x3]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "smlal v14.4s, v24.4h, v1.4h\n"
- "smlal v9.4s, v30.4h, v1.4h\n"
- "ldr d0, [x8, #0x78]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v7.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v24.8h, v2.8h\n"
- "ldr d24, [x12, x1]\n"
- "smlal2 v6.4s, v26.8h, v1.8h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x0, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v13.4s, v24.8h, v2.8h\n"
+ "smlal2 v19.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x3]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal v14.4s, v27.4h, v2.4h\n"
- "smlal v9.4s, v26.4h, v2.4h\n"
- "ldr d1, [x8, #0x80]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v27.8h, v3.8h\n"
- "smlal2 v6.4s, v25.8h, v2.8h\n"
- "ldr d27, [x10, x1]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x0, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v13.4s, v27.8h, v3.8h\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x3]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v14.4s, v23.4h, v3.4h\n"
- "smlal v9.4s, v25.4h, v3.4h\n"
- "ldr d2, [x8, #0x88]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x1]\n"
- "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x0, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v13.4s, v23.8h, v4.8h\n"
+ "smlal2 v19.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x10, x3]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal v14.4s, v28.4h, v4.4h\n"
- "smlal v9.4s, v24.4h, v4.4h\n"
- "ldr d3, [x8, #0x90]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "ldr d28, [x26, x1]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "ldr d31, [x28, x1]\n"
- "smlal2 v6.4s, v22.8h, v4.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal v9.4s, v27.4h, v0.4h\n"
- "ldr d4, [x8, #0x98]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v7.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x0, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v13.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x22, x3]\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x3]\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v21.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x1]\n"
- "smlal2 v6.4s, v23.8h, v0.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x0, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v13.4s, v30.8h, v1.8h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x3]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "smlal v14.4s, v26.4h, v1.4h\n"
- "smlal v9.4s, v23.4h, v1.4h\n"
- "ldr d0, [x8, #0xa0]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "smlal v7.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v26.8h, v2.8h\n"
- "smlal2 v6.4s, v31.8h, v1.8h\n"
- "ldr d26, [x25, x1]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x0, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v13.4s, v26.8h, v2.8h\n"
+ "smlal2 v19.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x3]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v14.4s, v25.4h, v2.4h\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "ldr d1, [x8, #0xa8]\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "smlal v7.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v25.8h, v3.8h\n"
- "smlal2 v6.4s, v30.8h, v2.8h\n"
- "ldr d25, [x24, x1]\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x0, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "smlal2 v19.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x3]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "smlal v14.4s, v24.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v3.4h\n"
- "ldr d2, [x8, #0xb0]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "smlal v7.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x1]\n"
- "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x0, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v13.4s, v24.8h, v4.8h\n"
+ "smlal2 v19.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x3]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal v14.4s, v22.4h, v4.4h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "ldr d3, [x8, #0xb8]\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "smlal v7.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x1]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x0, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v13.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x3]\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal2 v6.4s, v26.8h, v4.8h\n"
- "ldr d4, [x8, #0xc0]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "smlal v14.4s, v23.4h, v0.4h\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal v7.4s, v24.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "ldr d25, [x7, x1]\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
- "smlal2 v6.4s, v24.8h, v0.8h\n"
"smlal v11.4s, v23.4h, v1.4h\n"
- "smlal v14.4s, v31.4h, v1.4h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal v7.4s, v27.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "ldr d24, [x20, x1]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal2 v6.4s, v27.8h, v1.8h\n"
+ "smlal2 v19.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x0, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v13.4s, v23.8h, v1.8h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x15, x3]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
"smlal v11.4s, v31.4h, v2.4h\n"
- "smlal v14.4s, v30.4h, v2.4h\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "ldr d27, [x19, x1]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal2 v6.4s, v25.8h, v2.8h\n"
- "add x1, x1, #0x8\n"
+ "smlal2 v19.4s, v23.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v13.4s, v31.8h, v2.8h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x21, x3]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
"smlal v11.4s, v30.4h, v3.4h\n"
- "smlal v14.4s, v28.4h, v3.4h\n"
- "smlal v9.4s, v25.4h, v3.4h\n"
- "smlal v7.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "smlal2 v19.4s, v31.8h, v1.8h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v13.4s, v30.8h, v3.8h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x20, x3]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
"smlal v11.4s, v28.4h, v4.4h\n"
- "smlal v14.4s, v26.4h, v4.4h\n"
- "sqrdmulh v11.4s, v11.4s, v17.4s\n"
- "smlal v9.4s, v24.4h, v4.4h\n"
- "smlal v7.4s, v27.4h, v4.4h\n"
- "sqrdmulh v14.4s, v14.4s, v17.4s\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal2 v6.4s, v27.8h, v4.8h\n"
- "sqrdmulh v7.4s, v7.4s, v17.4s\n"
- "and v23.16b, v11.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v18.4s\n"
- "and v22.16b, v14.16b, v5.16b\n"
- "sqrdmulh v10.4s, v10.4s, v18.4s\n"
- "and v17.16b, v9.16b, v5.16b\n"
+ "smlal2 v19.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v31.16b, v11.16b, v21.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "smlal2 v13.4s, v28.8h, v4.8h\n"
+ "smlal2 v19.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v16.4s\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "and v17.16b, v13.16b, v10.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "smlal2 v19.4s, v26.8h, v4.8h\n"
"sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "and v20.16b, v7.16b, v5.16b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
"sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v19.16b, v21.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v18.16b, v10.16b, v29.16b\n"
+ "sqadd v11.4s, v11.4s, v31.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v8.16b, v29.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v4.16b, v6.16b, v29.16b\n"
- "sqadd v11.4s, v11.4s, v23.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v22.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
+ "and v26.16b, v20.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "and v18.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v16.4s\n"
+ "and v31.16b, v6.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v17.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v7.4s, v7.4s, v20.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v5.4s\n"
- "sqadd v8.4s, v8.4s, v26.4s\n"
- "srshl v7.4s, v7.4s, v5.4s\n"
- "sqadd v6.4s, v6.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v29.4s\n"
+ "and v27.16b, v19.16b, v10.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v25.16b, v7.16b, v10.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v17.16b, v5.16b, v10.16b\n"
+ "sqadd v20.4s, v20.4s, v26.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v31.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v27.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "sqadd v7.4s, v7.4s, v25.4s\n"
+ "srshl v6.4s, v6.4s, v21.4s\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
"sqxtn v11.4h, v11.4s\n"
- "srshl v10.4s, v10.4s, v29.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v6.4s, v6.4s, v29.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "sqxtn2 v11.8h, v21.4s\n"
- "sqxtn2 v14.8h, v10.4s\n"
- "sqxtn2 v9.8h, v8.4s\n"
- "sqxtn2 v7.8h, v6.4s\n"
- "sqadd v11.8h, v11.8h, v16.8h\n"
- "sqadd v14.8h, v14.8h, v16.8h\n"
- "sqadd v9.8h, v9.8h, v16.8h\n"
- "sqadd v7.8h, v7.8h, v16.8h\n"
- "smax v11.8h, v11.8h, v12.8h\n"
- "smax v14.8h, v14.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v7.8h, v7.8h, v12.8h\n"
- "smin v11.8h, v11.8h, v13.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v9.8h, v9.8h, v13.8h\n"
- "smin v7.8h, v7.8h, v13.8h\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v5.4s, v5.4s, v10.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v11.8h, v13.4s\n"
+ "sqxtn2 v20.8h, v19.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v11.8h, v11.8h, v15.8h\n"
+ "sqadd v20.8h, v20.8h, v15.8h\n"
+ "sqadd v8.8h, v8.8h, v15.8h\n"
+ "sqadd v6.8h, v6.8h, v15.8h\n"
+ "smax v11.8h, v11.8h, v14.8h\n"
+ "smax v20.8h, v20.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v11.8h, v11.8h, v12.8h\n"
+ "smin v20.8h, v20.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
"uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "str d11, [x21, x2]\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d14, [x15, x2]\n"
- "str d9, [x17, x2]\n"
- "str d7, [x16, x2]\n"
- "add x2, x2, #0x8\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d11, [x7, x1]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d20, [x8, x1]\n"
+ "str d8, [x17, x1]\n"
+ "str d6, [x16, x1]\n"
+ "add x1, x1, #0x8\n"
"beq 124f\n"
- "add x8, x8, #0xc8\n"
+ "add x0, x0, #0xc8\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x4, #2, 5f\n"
- "ld1 { v11.4s }, [x19], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
"tbz x4, #1, 4f\n"
- "ld1 { v21.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x13], #0x8\n"
"tbz x4, #0, 7f\n"
- "ld1 { v21.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x13]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x4, #0, 7f\n"
- "ld1 { v21.s }[0], [x19]\n"
+ "ld1 { v13.s }[0], [x13]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x4, #1, 6f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v11.d }[0], [x13], #0x8\n"
"tbz x4, #0, 7f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v11.s }[2], [x13]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 7f\n"
- "ld1 { v11.s }[0], [x19]\n"
+ "ld1 { v11.s }[0], [x13]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x8, #0x0]\n"
- "ldr d1, [x8, #0x8]\n"
- "mov v14.16b, v11.16b\n"
- "mov v10.16b, v21.16b\n"
- "ldr d2, [x8, #0x10]\n"
- "ldr d3, [x8, #0x18]\n"
- "mov v9.16b, v11.16b\n"
- "mov v8.16b, v21.16b\n"
- "ldr d4, [x8, #0x20]\n"
- "ldp x28, x27, [x0, #0x0]\n"
- "mov v7.16b, v11.16b\n"
- "mov v6.16b, v21.16b\n"
- "ldp x10, x26, [x0, #0x10]\n"
- "ldp x24, x23, [x0, #0x20]\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldp x22, x25, [x0, #0x30]\n"
- "ldp x20, x19, [x0, #0x40]\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "add x28, x28, x1\n"
- "add x27, x27, x1\n"
- "add x10, x10, x1\n"
- "add x26, x26, x1\n"
- "add x24, x24, x1\n"
- "add x23, x23, x1\n"
- "add x22, x22, x1\n"
- "add x25, x25, x1\n"
- "add x20, x20, x1\n"
- "add x19, x19, x1\n"
+ "ldr d0, [x0, #0x0]\n"
+ "ldr d1, [x0, #0x8]\n"
+ "mov v20.16b, v11.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr d2, [x0, #0x10]\n"
+ "ldr d3, [x0, #0x18]\n"
+ "mov v8.16b, v11.16b\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr d4, [x0, #0x20]\n"
+ "ldp x10, x28, [x2, #0x0]\n"
+ "mov v6.16b, v11.16b\n"
+ "mov v5.16b, v13.16b\n"
+ "ldp x27, x26, [x2, #0x10]\n"
+ "ldp x25, x24, [x2, #0x20]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldp x23, x22, [x2, #0x30]\n"
+ "ldp x21, x20, [x2, #0x40]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "add x10, x10, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
"tbz x4, #2, 9f\n"
- "ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "ld1 { v29.s }[0], [x10], #0x4\n"
+ "ld1 { v31.s }[0], [x10], #0x4\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x27], #0x4\n"
"ld1 { v28.s }[0], [x26], #0x4\n"
- "ld1 { v27.s }[0], [x24], #0x4\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
- "ld1 { v25.s }[0], [x22], #0x4\n"
- "ld1 { v24.s }[0], [x25], #0x4\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "ld1 { v22.s }[0], [x19], #0x4\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
"tbz x4, #1, 8f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v29.h }[2], [x10], #0x2\n"
+ "ld1 { v31.h }[2], [x10], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
"ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "ld1 { v22.h }[2], [x19], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz x4, #0, 11f\n"
- "ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
- "ld1 { v29.b }[6], [x10]\n"
+ "ld1 { v31.b }[6], [x10]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
"ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
- "ld1 { v24.b }[6], [x25]\n"
- "ld1 { v26.b }[6], [x20]\n"
- "ld1 { v22.b }[6], [x19]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x4, #0, 11f\n"
- "ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x10]\n"
+ "ld1 { v31.b }[4], [x10]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
"ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
- "ld1 { v24.b }[4], [x25]\n"
- "ld1 { v26.b }[4], [x20]\n"
- "ld1 { v22.b }[4], [x19]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x4, #1, 10f\n"
- "ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "ld1 { v29.h }[0], [x10], #0x2\n"
+ "ld1 { v31.h }[0], [x10], #0x2\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x27], #0x2\n"
"ld1 { v28.h }[0], [x26], #0x2\n"
- "ld1 { v27.h }[0], [x24], #0x2\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
- "ld1 { v25.h }[0], [x22], #0x2\n"
- "ld1 { v24.h }[0], [x25], #0x2\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "ld1 { v22.h }[0], [x19], #0x2\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
"tbz x4, #0, 11f\n"
- "ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
- "ld1 { v29.b }[2], [x10]\n"
+ "ld1 { v31.b }[2], [x10]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
"ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
- "ld1 { v24.b }[2], [x25]\n"
- "ld1 { v26.b }[2], [x20]\n"
- "ld1 { v22.b }[2], [x19]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 11f\n"
- "ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
- "ld1 { v29.b }[0], [x10]\n"
+ "ld1 { v31.b }[0], [x10]\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x27]\n"
"ld1 { v28.b }[0], [x26]\n"
- "ld1 { v27.b }[0], [x24]\n"
- "ld1 { v23.b }[0], [x23]\n"
- "ld1 { v25.b }[0], [x22]\n"
- "ld1 { v24.b }[0], [x25]\n"
- "ld1 { v26.b }[0], [x20]\n"
- "ld1 { v22.b }[0], [x19]\n"
+ "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v22.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
"ushll v30.8h, v30.8b, #0x0\n"
"smlal v11.4s, v31.4h, v0.4h\n"
- "ldr x19, [x0, #0x50]\n"
+ "ldr x20, [x2, #0x50]\n"
"ushll v29.8h, v29.8b, #0x0\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v9.4s, v29.4h, v0.4h\n"
+ "smlal2 v13.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "add x19, x19, x1\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "add x20, x20, x3\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v7.4s, v28.4h, v0.4h\n"
- "smlal2 v6.4s, v28.8h, v0.8h\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
"smlal v11.4s, v30.4h, v1.4h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v21.4s, v30.8h, v1.8h\n"
- "smlal v14.4s, v27.4h, v1.4h\n"
+ "smlal2 v13.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "smlal2 v10.4s, v27.8h, v1.8h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v27.8h, v1.8h\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v7.4s, v23.4h, v1.4h\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
"ushll v22.8h, v22.8b, #0x0\n"
- "smlal2 v6.4s, v23.8h, v1.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
"smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "smlal v14.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v9.4s, v23.4h, v2.4h\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "smlal2 v13.4s, v27.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"tbz x4, #2, 13f\n"
- "ld1 { v31.s }[0], [x19], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x4, #1, 12f\n"
- "ld1 { v31.h }[2], [x19], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x4, #0, 15f\n"
- "ld1 { v31.b }[6], [x19]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x4, #0, 15f\n"
- "ld1 { v31.b }[4], [x19]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x4, #1, 14f\n"
- "ld1 { v31.h }[0], [x19], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x4, #0, 15f\n"
- "ld1 { v31.b }[2], [x19]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 15f\n"
- "ld1 { v31.b }[0], [x19]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
- "ldr x20, [x0, #0x58]\n"
- "smlal v7.4s, v31.4h, v2.4h\n"
- "smlal2 v6.4s, v31.8h, v2.8h\n"
+ "ldr x22, [x2, #0x58]\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
"smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v25.8h, v3.8h\n"
- "add x20, x20, x1\n"
- "smlal v14.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v9.4s, v31.4h, v3.4h\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "add x22, x22, x3\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"tbz x4, #2, 17f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
"tbz x4, #1, 16f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
"tbz x4, #0, 19f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x4, #0, 19f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x4, #1, 18f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
"tbz x4, #0, 19f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 19f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "ld1 { v30.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"ushll v30.8h, v30.8b, #0x0\n"
- "ldr x19, [x0, #0x60]\n"
- "smlal v7.4s, v30.4h, v3.4h\n"
- "smlal2 v6.4s, v30.8h, v3.8h\n"
+ "ldr x21, [x2, #0x60]\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
"smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v24.8h, v4.8h\n"
- "add x19, x19, x1\n"
+ "smlal2 v13.4s, v24.8h, v4.8h\n"
+ "add x21, x21, x3\n"
"tbz x4, #2, 21f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
"tbz x4, #1, 20f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
"tbz x4, #0, 23f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x4, #0, 23f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x4, #1, 22f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
"tbz x4, #0, 23f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 23f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d0, [x0, #0x28]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr d0, [x8, #0x28]\n"
- "smlal v14.4s, v27.4h, v4.4h\n"
- "smlal2 v10.4s, v27.8h, v4.8h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x24, [x0, #0x68]\n"
- "smlal v7.4s, v26.4h, v4.4h\n"
- "smlal2 v6.4s, v26.8h, v4.8h\n"
- "add x24, x24, x1\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v4.8h\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "ldr x20, [x2, #0x68]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x3\n"
"smlal v11.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "smlal v14.4s, v28.4h, v0.4h\n"
- "smlal2 v10.4s, v28.8h, v0.8h\n"
- "smlal v9.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal2 v13.4s, v29.8h, v0.8h\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "smlal2 v19.4s, v28.8h, v0.8h\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"tbz x4, #2, 25f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
"tbz x4, #1, 24f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
"tbz x4, #0, 27f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x4, #0, 27f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x4, #1, 26f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
"tbz x4, #0, 27f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 27f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x8, #0x30]\n"
+ "ldr d1, [x0, #0x30]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x23, [x0, #0x70]\n"
- "smlal v7.4s, v25.4h, v0.4h\n"
- "smlal2 v6.4s, v25.8h, v0.8h\n"
- "add x23, x23, x1\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldr x25, [x2, #0x70]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "add x25, x25, x3\n"
"smlal v11.4s, v28.4h, v1.4h\n"
- "smlal2 v21.4s, v28.8h, v1.8h\n"
- "smlal v14.4s, v23.4h, v1.4h\n"
- "smlal2 v10.4s, v23.8h, v1.8h\n"
- "smlal v9.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal2 v13.4s, v28.8h, v1.8h\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"tbz x4, #2, 29f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v24.s }[0], [x25], #0x4\n"
"tbz x4, #1, 28f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
"tbz x4, #0, 31f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x25]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x4, #0, 31f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x25]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x4, #1, 30f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v24.h }[0], [x25], #0x2\n"
"tbz x4, #0, 31f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x25]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 31f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v24.b }[0], [x25]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x8, #0x38]\n"
+ "ldr d2, [x0, #0x38]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x26, [x0, #0x78]\n"
- "smlal v7.4s, v24.4h, v1.4h\n"
- "smlal2 v6.4s, v24.8h, v1.8h\n"
- "add x26, x26, x1\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x26, [x2, #0x78]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "add x26, x26, x3\n"
"smlal v11.4s, v23.4h, v2.4h\n"
- "smlal2 v21.4s, v23.8h, v2.8h\n"
- "smlal v14.4s, v31.4h, v2.4h\n"
- "smlal2 v10.4s, v31.8h, v2.8h\n"
- "smlal v9.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal2 v13.4s, v23.8h, v2.8h\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"tbz x4, #2, 33f\n"
"ld1 { v27.s }[0], [x26], #0x4\n"
"tbz x4, #1, 32f\n"
@@ -1381,179 +1381,179 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x4, #0, 35f\n"
"ld1 { v27.b }[0], [x26]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x8, #0x40]\n"
+ "ldr d3, [x0, #0x40]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x7, [x0, #0x80]\n"
- "smlal v7.4s, v27.4h, v2.4h\n"
- "smlal2 v6.4s, v27.8h, v2.8h\n"
- "add x7, x7, x1\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x23, [x2, #0x80]\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "add x23, x23, x3\n"
"smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal v14.4s, v30.4h, v3.4h\n"
- "smlal2 v10.4s, v30.8h, v3.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal2 v13.4s, v31.8h, v3.8h\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"tbz x4, #2, 37f\n"
- "ld1 { v23.s }[0], [x7], #0x4\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
"tbz x4, #1, 36f\n"
- "ld1 { v23.h }[2], [x7], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
"tbz x4, #0, 39f\n"
- "ld1 { v23.b }[6], [x7]\n"
+ "ld1 { v23.b }[6], [x23]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x4, #0, 39f\n"
- "ld1 { v23.b }[4], [x7]\n"
+ "ld1 { v23.b }[4], [x23]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x4, #1, 38f\n"
- "ld1 { v23.h }[0], [x7], #0x2\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
"tbz x4, #0, 39f\n"
- "ld1 { v23.b }[2], [x7]\n"
+ "ld1 { v23.b }[2], [x23]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 39f\n"
- "ld1 { v23.b }[0], [x7]\n"
+ "ld1 { v23.b }[0], [x23]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x8, #0x48]\n"
+ "ldr d4, [x0, #0x48]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x22, [x0, #0x88]\n"
- "smlal v7.4s, v23.4h, v3.4h\n"
- "smlal2 v6.4s, v23.8h, v3.8h\n"
- "add x22, x22, x1\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x24, [x2, #0x88]\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "add x24, x24, x3\n"
"smlal v11.4s, v30.4h, v4.4h\n"
- "smlal2 v21.4s, v30.8h, v4.8h\n"
- "smlal v14.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v9.4s, v23.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal2 v13.4s, v30.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"tbz x4, #2, 41f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
"tbz x4, #1, 40f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
"tbz x4, #0, 43f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x4, #0, 43f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x4, #1, 42f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
"tbz x4, #0, 43f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 43f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x24]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x8, #0x50]\n"
+ "ldr d0, [x0, #0x50]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x20, [x0, #0x90]\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v6.4s, v28.8h, v4.8h\n"
- "add x20, x20, x1\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "ldr x15, [x2, #0x90]\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "add x15, x15, x3\n"
"smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v21.4s, v22.8h, v0.8h\n"
- "smlal v14.4s, v25.4h, v0.4h\n"
- "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "smlal2 v13.4s, v22.8h, v0.8h\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "smlal2 v19.4s, v25.8h, v0.8h\n"
"tbz x4, #2, 45f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x15], #0x4\n"
"tbz x4, #1, 44f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x15], #0x2\n"
"tbz x4, #0, 47f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x15]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x4, #0, 47f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x15]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x4, #1, 46f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x15], #0x2\n"
"tbz x4, #0, 47f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x15]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 47f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x15]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
"ushll v31.8h, v31.8b, #0x0\n"
- "ldr x14, [x0, #0x98]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "add x14, x14, x1\n"
+ "ldr x21, [x2, #0x98]\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "add x21, x21, x3\n"
"tbz x4, #2, 49f\n"
- "ld1 { v30.s }[0], [x14], #0x4\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
"tbz x4, #1, 48f\n"
- "ld1 { v30.h }[2], [x14], #0x2\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
"tbz x4, #0, 51f\n"
- "ld1 { v30.b }[6], [x14]\n"
+ "ld1 { v30.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x4, #0, 51f\n"
- "ld1 { v30.b }[4], [x14]\n"
+ "ld1 { v30.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x4, #1, 50f\n"
- "ld1 { v30.h }[0], [x14], #0x2\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
"tbz x4, #0, 51f\n"
- "ld1 { v30.b }[2], [x14]\n"
+ "ld1 { v30.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 51f\n"
- "ld1 { v30.b }[0], [x14]\n"
+ "ld1 { v30.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x8, #0x58]\n"
+ "ldr d1, [x0, #0x58]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x19, [x0, #0xa0]\n"
- "smlal v7.4s, v30.4h, v0.4h\n"
- "smlal2 v6.4s, v30.8h, v0.8h\n"
- "add x19, x19, x1\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldr x14, [x2, #0xa0]\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "add x14, x14, x3\n"
"smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v21.4s, v25.8h, v1.8h\n"
- "smlal v14.4s, v24.4h, v1.4h\n"
- "smlal2 v10.4s, v24.8h, v1.8h\n"
- "smlal v9.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal2 v13.4s, v25.8h, v1.8h\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v1.8h\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"tbz x4, #2, 53f\n"
- "ld1 { v26.s }[0], [x19], #0x4\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
"tbz x4, #1, 52f\n"
- "ld1 { v26.h }[2], [x19], #0x2\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
"tbz x4, #0, 55f\n"
- "ld1 { v26.b }[6], [x19]\n"
+ "ld1 { v26.b }[6], [x14]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x4, #0, 55f\n"
- "ld1 { v26.b }[4], [x19]\n"
+ "ld1 { v26.b }[4], [x14]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x4, #1, 54f\n"
- "ld1 { v26.h }[0], [x19], #0x2\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
"tbz x4, #0, 55f\n"
- "ld1 { v26.b }[2], [x19]\n"
+ "ld1 { v26.b }[2], [x14]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 55f\n"
- "ld1 { v26.b }[0], [x19]\n"
+ "ld1 { v26.b }[0], [x14]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x8, #0x60]\n"
+ "ldr d2, [x0, #0x60]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x13, [x0, #0xa8]\n"
- "smlal v7.4s, v26.4h, v1.4h\n"
- "smlal2 v6.4s, v26.8h, v1.8h\n"
- "add x13, x13, x1\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x13, [x2, #0xa8]\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "add x13, x13, x3\n"
"smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v21.4s, v24.8h, v2.8h\n"
- "smlal v14.4s, v27.4h, v2.4h\n"
- "smlal2 v10.4s, v27.8h, v2.8h\n"
- "smlal v9.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal2 v13.4s, v24.8h, v2.8h\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"tbz x4, #2, 57f\n"
"ld1 { v25.s }[0], [x13], #0x4\n"
"tbz x4, #1, 56f\n"
@@ -1575,19 +1575,19 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x4, #0, 59f\n"
"ld1 { v25.b }[0], [x13]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x8, #0x68]\n"
+ "ldr d3, [x0, #0x68]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x12, [x0, #0xb0]\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v6.4s, v25.8h, v2.8h\n"
- "add x12, x12, x1\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x12, [x2, #0xb0]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x12, x12, x3\n"
"smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v21.4s, v27.8h, v3.8h\n"
- "smlal v14.4s, v23.4h, v3.4h\n"
- "smlal2 v10.4s, v23.8h, v3.8h\n"
- "smlal v9.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal2 v13.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"tbz x4, #2, 61f\n"
"ld1 { v24.s }[0], [x12], #0x4\n"
"tbz x4, #1, 60f\n"
@@ -1609,573 +1609,573 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x4, #0, 63f\n"
"ld1 { v24.b }[0], [x12]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x8, #0x70]\n"
+ "ldr d4, [x0, #0x70]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x11, [x0, #0xb8]\n"
- "smlal v7.4s, v24.4h, v3.4h\n"
- "smlal2 v6.4s, v24.8h, v3.8h\n"
- "add x11, x11, x1\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x20, [x2, #0xb8]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x3\n"
"smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v21.4s, v23.8h, v4.8h\n"
- "smlal v14.4s, v28.4h, v4.4h\n"
- "smlal2 v10.4s, v28.8h, v4.8h\n"
- "smlal v9.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal2 v13.4s, v23.8h, v4.8h\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"tbz x4, #2, 65f\n"
- "ld1 { v22.s }[0], [x11], #0x4\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
"tbz x4, #1, 64f\n"
- "ld1 { v22.h }[2], [x11], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz x4, #0, 67f\n"
- "ld1 { v22.b }[6], [x11]\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x4, #0, 67f\n"
- "ld1 { v22.b }[4], [x11]\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x4, #1, 66f\n"
- "ld1 { v22.h }[0], [x11], #0x2\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
"tbz x4, #0, 67f\n"
- "ld1 { v22.b }[2], [x11]\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 67f\n"
- "ld1 { v22.b }[0], [x11]\n"
+ "ld1 { v22.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x8, #0x78]\n"
+ "ldr d0, [x0, #0x78]\n"
"ushll v22.8h, v22.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x10, [x0, #0xc0]\n"
- "smlal v7.4s, v22.4h, v4.4h\n"
- "smlal2 v6.4s, v22.8h, v4.8h\n"
- "add x10, x10, x1\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "ldr x11, [x2, #0xc0]\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "add x11, x11, x3\n"
"smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "smlal v14.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal2 v13.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
"tbz x4, #2, 69f\n"
- "ld1 { v27.s }[0], [x10], #0x4\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
"tbz x4, #1, 68f\n"
- "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
"tbz x4, #0, 71f\n"
- "ld1 { v27.b }[6], [x10]\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x4, #0, 71f\n"
- "ld1 { v27.b }[4], [x10]\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x4, #1, 70f\n"
- "ld1 { v27.h }[0], [x10], #0x2\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
"tbz x4, #0, 71f\n"
- "ld1 { v27.b }[2], [x10]\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 71f\n"
- "ld1 { v27.b }[0], [x10]\n"
+ "ld1 { v27.b }[0], [x11]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x9, [x0, #0xc8]\n"
- "smlal v9.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "add x9, x9, x1\n"
+ "ldr x10, [x2, #0xc8]\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "add x10, x10, x3\n"
"tbz x4, #2, 73f\n"
- "ld1 { v23.s }[0], [x9], #0x4\n"
+ "ld1 { v23.s }[0], [x10], #0x4\n"
"tbz x4, #1, 72f\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v23.h }[2], [x10], #0x2\n"
"tbz x4, #0, 75f\n"
- "ld1 { v23.b }[6], [x9]\n"
+ "ld1 { v23.b }[6], [x10]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x4, #0, 75f\n"
- "ld1 { v23.b }[4], [x9]\n"
+ "ld1 { v23.b }[4], [x10]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x4, #1, 74f\n"
- "ld1 { v23.h }[0], [x9], #0x2\n"
+ "ld1 { v23.h }[0], [x10], #0x2\n"
"tbz x4, #0, 75f\n"
- "ld1 { v23.b }[2], [x9]\n"
+ "ld1 { v23.b }[2], [x10]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 75f\n"
- "ld1 { v23.b }[0], [x9]\n"
+ "ld1 { v23.b }[0], [x10]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x8, #0x80]\n"
+ "ldr d1, [x0, #0x80]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x28, [x0, #0xd0]\n"
- "smlal v7.4s, v23.4h, v0.4h\n"
- "smlal2 v6.4s, v23.8h, v0.8h\n"
- "add x28, x28, x1\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldr x22, [x2, #0xd0]\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "add x22, x22, x3\n"
"smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v21.4s, v30.8h, v1.8h\n"
- "smlal v14.4s, v26.4h, v1.4h\n"
- "smlal2 v10.4s, v26.8h, v1.8h\n"
- "smlal v9.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal2 v13.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v1.8h\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"tbz x4, #2, 77f\n"
- "ld1 { v31.s }[0], [x28], #0x4\n"
+ "ld1 { v31.s }[0], [x22], #0x4\n"
"tbz x4, #1, 76f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
"tbz x4, #0, 79f\n"
- "ld1 { v31.b }[6], [x28]\n"
+ "ld1 { v31.b }[6], [x22]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x4, #0, 79f\n"
- "ld1 { v31.b }[4], [x28]\n"
+ "ld1 { v31.b }[4], [x22]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x4, #1, 78f\n"
- "ld1 { v31.h }[0], [x28], #0x2\n"
+ "ld1 { v31.h }[0], [x22], #0x2\n"
"tbz x4, #0, 79f\n"
- "ld1 { v31.b }[2], [x28]\n"
+ "ld1 { v31.b }[2], [x22]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 79f\n"
- "ld1 { v31.b }[0], [x28]\n"
+ "ld1 { v31.b }[0], [x22]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x8, #0x88]\n"
+ "ldr d2, [x0, #0x88]\n"
"ushll v31.8h, v31.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x27, [x0, #0xd8]\n"
- "smlal v7.4s, v31.4h, v1.4h\n"
- "smlal2 v6.4s, v31.8h, v1.8h\n"
- "add x27, x27, x1\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x28, [x2, #0xd8]\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "add x28, x28, x3\n"
"smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v21.4s, v26.8h, v2.8h\n"
- "smlal v14.4s, v25.4h, v2.4h\n"
- "smlal2 v10.4s, v25.8h, v2.8h\n"
- "smlal v9.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal2 v13.4s, v26.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"tbz x4, #2, 81f\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
"tbz x4, #1, 80f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
"tbz x4, #0, 83f\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x28]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x4, #0, 83f\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x28]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x4, #1, 82f\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
"tbz x4, #0, 83f\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x28]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 83f\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x28]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x8, #0x90]\n"
+ "ldr d3, [x0, #0x90]\n"
"ushll v30.8h, v30.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x26, [x0, #0xe0]\n"
- "smlal v7.4s, v30.4h, v2.4h\n"
- "smlal2 v6.4s, v30.8h, v2.8h\n"
- "add x26, x26, x1\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x27, [x2, #0xe0]\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "add x27, x27, x3\n"
"smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v21.4s, v25.8h, v3.8h\n"
- "smlal v14.4s, v24.4h, v3.4h\n"
- "smlal2 v10.4s, v24.8h, v3.8h\n"
- "smlal v9.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"tbz x4, #2, 85f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
"tbz x4, #1, 84f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
"tbz x4, #0, 87f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x27]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x4, #0, 87f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x27]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x4, #1, 86f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
"tbz x4, #0, 87f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x27]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 87f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "ld1 { v28.b }[0], [x27]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x8, #0x98]\n"
+ "ldr d4, [x0, #0x98]\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x25, [x0, #0xe8]\n"
- "smlal v7.4s, v28.4h, v3.4h\n"
- "smlal2 v6.4s, v28.8h, v3.8h\n"
- "add x25, x25, x1\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x26, [x2, #0xe8]\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "add x26, x26, x3\n"
"smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v21.4s, v24.8h, v4.8h\n"
- "smlal v14.4s, v22.4h, v4.4h\n"
- "smlal2 v10.4s, v22.8h, v4.8h\n"
- "smlal v9.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal2 v13.4s, v24.8h, v4.8h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v4.8h\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"tbz x4, #2, 89f\n"
- "ld1 { v26.s }[0], [x25], #0x4\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
"tbz x4, #1, 88f\n"
- "ld1 { v26.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
"tbz x4, #0, 91f\n"
- "ld1 { v26.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x26]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x4, #0, 91f\n"
- "ld1 { v26.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x26]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x4, #1, 90f\n"
- "ld1 { v26.h }[0], [x25], #0x2\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
"tbz x4, #0, 91f\n"
- "ld1 { v26.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x26]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 91f\n"
- "ld1 { v26.b }[0], [x25]\n"
+ "ld1 { v26.b }[0], [x26]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x8, #0xa0]\n"
+ "ldr d0, [x0, #0xa0]\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v15.8b\n"
- "ldr x24, [x0, #0xf0]\n"
- "smlal v7.4s, v26.4h, v4.4h\n"
- "smlal2 v6.4s, v26.8h, v4.8h\n"
- "add x24, x24, x1\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "ldr x25, [x2, #0xf0]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x25, x25, x3\n"
"smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v21.4s, v27.8h, v0.8h\n"
- "smlal v14.4s, v23.4h, v0.4h\n"
- "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v13.4s, v27.8h, v0.8h\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "smlal2 v19.4s, v23.8h, v0.8h\n"
"tbz x4, #2, 93f\n"
- "ld1 { v25.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
"tbz x4, #1, 92f\n"
- "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
"tbz x4, #0, 95f\n"
- "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x25]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
"tbz x4, #0, 95f\n"
- "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x25]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
"tbz x4, #1, 94f\n"
- "ld1 { v25.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
"tbz x4, #0, 95f\n"
- "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x25]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 95f\n"
- "ld1 { v25.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x25]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
"ushll v25.8h, v25.8b, #0x0\n"
- "ldr x23, [x0, #0xf8]\n"
- "smlal v9.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "add x23, x23, x1\n"
+ "ldr x24, [x2, #0xf8]\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "add x24, x24, x3\n"
"tbz x4, #2, 97f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
"tbz x4, #1, 96f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
"tbz x4, #0, 99f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x24]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x4, #0, 99f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x24]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x4, #1, 98f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
"tbz x4, #0, 99f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x24]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 99f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v24.b }[0], [x24]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x8, #0xa8]\n"
+ "ldr d1, [x0, #0xa8]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v15.8b\n"
- "ldr x22, [x0, #0x100]\n"
- "smlal v7.4s, v24.4h, v0.4h\n"
- "smlal2 v6.4s, v24.8h, v0.8h\n"
- "add x22, x22, x1\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldr x23, [x2, #0x100]\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "add x23, x23, x3\n"
"smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v14.4s, v31.4h, v1.4h\n"
- "smlal2 v10.4s, v31.8h, v1.8h\n"
- "smlal v9.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v13.4s, v23.8h, v1.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v1.8h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
"tbz x4, #2, 101f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
"tbz x4, #1, 100f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
"tbz x4, #0, 103f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x4, #0, 103f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x4, #1, 102f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
"tbz x4, #0, 103f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 103f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x23]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x8, #0xb0]\n"
+ "ldr d2, [x0, #0xb0]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "usubl v2.8h, v2.8b, v15.8b\n"
- "ldr x7, [x0, #0x108]\n"
- "smlal v7.4s, v27.4h, v1.4h\n"
- "smlal2 v6.4s, v27.8h, v1.8h\n"
- "add x7, x7, x1\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x15, [x2, #0x108]\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "add x15, x15, x3\n"
"smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v14.4s, v30.4h, v2.4h\n"
- "smlal2 v10.4s, v30.8h, v2.8h\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v13.4s, v31.8h, v2.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v2.8h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
"tbz x4, #2, 105f\n"
- "ld1 { v25.s }[0], [x7], #0x4\n"
+ "ld1 { v25.s }[0], [x15], #0x4\n"
"tbz x4, #1, 104f\n"
- "ld1 { v25.h }[2], [x7], #0x2\n"
+ "ld1 { v25.h }[2], [x15], #0x2\n"
"tbz x4, #0, 107f\n"
- "ld1 { v25.b }[6], [x7]\n"
+ "ld1 { v25.b }[6], [x15]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x4, #0, 107f\n"
- "ld1 { v25.b }[4], [x7]\n"
+ "ld1 { v25.b }[4], [x15]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x4, #1, 106f\n"
- "ld1 { v25.h }[0], [x7], #0x2\n"
+ "ld1 { v25.h }[0], [x15], #0x2\n"
"tbz x4, #0, 107f\n"
- "ld1 { v25.b }[2], [x7]\n"
+ "ld1 { v25.b }[2], [x15]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 107f\n"
- "ld1 { v25.b }[0], [x7]\n"
+ "ld1 { v25.b }[0], [x15]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x8, #0xb8]\n"
+ "ldr d3, [x0, #0xb8]\n"
"ushll v25.8h, v25.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v15.8b\n"
- "ldr x20, [x0, #0x110]\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v6.4s, v25.8h, v2.8h\n"
- "add x20, x20, x1\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x21, [x2, #0x110]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x21, x21, x3\n"
"smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v14.4s, v28.4h, v3.4h\n"
- "smlal2 v10.4s, v28.8h, v3.8h\n"
- "smlal v9.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal2 v13.4s, v30.8h, v3.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v28.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"tbz x4, #2, 109f\n"
- "ld1 { v24.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
"tbz x4, #1, 108f\n"
- "ld1 { v24.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
"tbz x4, #0, 111f\n"
- "ld1 { v24.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x4, #0, 111f\n"
- "ld1 { v24.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x4, #1, 110f\n"
- "ld1 { v24.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
"tbz x4, #0, 111f\n"
- "ld1 { v24.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 111f\n"
- "ld1 { v24.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x21]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x8, #0xc0]\n"
+ "ldr d4, [x0, #0xc0]\n"
"ushll v24.8h, v24.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr x19, [x0, #0x118]\n"
- "smlal v7.4s, v24.4h, v3.4h\n"
- "smlal2 v6.4s, v24.8h, v3.8h\n"
- "add x19, x19, x1\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x20, [x2, #0x118]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x3\n"
"smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "smlal v14.4s, v26.4h, v4.4h\n"
- "smlal2 v10.4s, v26.8h, v4.8h\n"
- "smlal v9.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal2 v13.4s, v28.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"tbz x4, #2, 113f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"tbz x4, #1, 112f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
"tbz x4, #0, 115f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x4, #0, 115f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x4, #1, 114f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
"tbz x4, #0, 115f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 115f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal v7.4s, v27.4h, v4.4h\n"
- "smlal2 v6.4s, v27.8h, v4.8h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
"tbz x4, #2, 117f\n"
- "ld1 { v17.4s }, [x5], #0x10\n"
- "ld1 { v5.4s }, [x6], #0x10\n"
+ "ld1 { v18.4s }, [x6], #0x10\n"
+ "ld1 { v21.4s }, [x5], #0x10\n"
"tbz x4, #1, 116f\n"
- "ld1 { v18.d }[0], [x5], #0x8\n"
- "ld1 { v29.d }[0], [x6], #0x8\n"
+ "ld1 { v16.d }[0], [x6], #0x8\n"
+ "ld1 { v10.d }[0], [x5], #0x8\n"
"tbz x4, #0, 119f\n"
- "ld1 { v18.s }[2], [x5]\n"
- "ld1 { v29.s }[2], [x6]\n"
+ "ld1 { v16.s }[2], [x6]\n"
+ "ld1 { v10.s }[2], [x5]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x4, #0, 119f\n"
- "ld1 { v18.s }[0], [x5]\n"
- "ld1 { v29.s }[0], [x6]\n"
+ "ld1 { v16.s }[0], [x6]\n"
+ "ld1 { v10.s }[0], [x5]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x4, #1, 118f\n"
- "ld1 { v17.d }[0], [x5], #0x8\n"
- "ld1 { v5.d }[0], [x6], #0x8\n"
+ "ld1 { v18.d }[0], [x6], #0x8\n"
+ "ld1 { v21.d }[0], [x5], #0x8\n"
"tbz x4, #0, 119f\n"
- "ld1 { v17.s }[2], [x5]\n"
- "ld1 { v5.s }[2], [x6]\n"
+ "ld1 { v18.s }[2], [x6]\n"
+ "ld1 { v21.s }[2], [x5]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 119f\n"
- "ld1 { v17.s }[0], [x5]\n"
- "ld1 { v5.s }[0], [x6]\n"
+ "ld1 { v18.s }[0], [x6]\n"
+ "ld1 { v21.s }[0], [x5]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v14.4s, v14.4s, v17.4s\n"
- "add x21, x21, x2\n"
- "add x15, x15, x2\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v17.4s\n"
- "add x17, x17, x2\n"
- "add x16, x16, x2\n"
- "and v23.16b, v11.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v18.4s\n"
- "and v22.16b, v14.16b, v5.16b\n"
- "sqrdmulh v10.4s, v10.4s, v18.4s\n"
- "and v17.16b, v9.16b, v5.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "and v31.16b, v11.16b, v21.16b\n"
+ "add x7, x7, x1\n"
+ "add x8, x8, x1\n"
+ "sqrdmulh v13.4s, v13.4s, v16.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "add x17, x17, x1\n"
+ "add x16, x16, x1\n"
+ "and v17.16b, v13.16b, v10.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
"sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "and v20.16b, v7.16b, v5.16b\n"
"sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v19.16b, v21.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v18.16b, v10.16b, v29.16b\n"
+ "sqadd v11.4s, v11.4s, v31.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v26.16b, v8.16b, v29.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v4.16b, v6.16b, v29.16b\n"
- "sqadd v11.4s, v11.4s, v23.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v14.4s, v14.4s, v22.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
+ "and v26.16b, v20.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "and v18.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v16.4s\n"
+ "and v31.16b, v6.16b, v21.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v17.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v7.4s, v7.4s, v20.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "srshl v11.4s, v11.4s, v5.4s\n"
- "sqadd v21.4s, v21.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v5.4s\n"
- "sqadd v10.4s, v10.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v5.4s\n"
- "sqadd v8.4s, v8.4s, v26.4s\n"
- "srshl v7.4s, v7.4s, v5.4s\n"
- "sqadd v6.4s, v6.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v29.4s\n"
+ "and v27.16b, v19.16b, v10.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v25.16b, v7.16b, v10.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v17.16b, v5.16b, v10.16b\n"
+ "sqadd v20.4s, v20.4s, v26.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v31.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v27.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "sqadd v7.4s, v7.4s, v25.4s\n"
+ "srshl v6.4s, v6.4s, v21.4s\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
"sqxtn v11.4h, v11.4s\n"
- "srshl v10.4s, v10.4s, v29.4s\n"
- "sqxtn v14.4h, v14.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v6.4s, v6.4s, v29.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "sqxtn2 v11.8h, v21.4s\n"
- "sqxtn2 v14.8h, v10.4s\n"
- "sqxtn2 v9.8h, v8.4s\n"
- "sqxtn2 v7.8h, v6.4s\n"
- "sqadd v11.8h, v11.8h, v16.8h\n"
- "sqadd v14.8h, v14.8h, v16.8h\n"
- "sqadd v9.8h, v9.8h, v16.8h\n"
- "sqadd v7.8h, v7.8h, v16.8h\n"
- "smax v11.8h, v11.8h, v12.8h\n"
- "smax v14.8h, v14.8h, v12.8h\n"
- "smax v9.8h, v9.8h, v12.8h\n"
- "smax v7.8h, v7.8h, v12.8h\n"
- "smin v11.8h, v11.8h, v13.8h\n"
- "smin v14.8h, v14.8h, v13.8h\n"
- "smin v9.8h, v9.8h, v13.8h\n"
- "smin v7.8h, v7.8h, v13.8h\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v10.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v5.4s, v5.4s, v10.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v11.8h, v13.4s\n"
+ "sqxtn2 v20.8h, v19.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v11.8h, v11.8h, v15.8h\n"
+ "sqadd v20.8h, v20.8h, v15.8h\n"
+ "sqadd v8.8h, v8.8h, v15.8h\n"
+ "sqadd v6.8h, v6.8h, v15.8h\n"
+ "smax v11.8h, v11.8h, v14.8h\n"
+ "smax v20.8h, v20.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v11.8h, v11.8h, v12.8h\n"
+ "smin v20.8h, v20.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
"uzp1 v11.16b, v11.16b, v11.16b\n"
- "uzp1 v14.16b, v14.16b, v14.16b\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
"tbz x4, #2, 121f\n"
- "st1 { v11.s }[0], [x21], #0x4\n"
- "st1 { v14.s }[0], [x15], #0x4\n"
- "st1 { v9.s }[0], [x17], #0x4\n"
- "st1 { v7.s }[0], [x16], #0x4\n"
+ "st1 { v11.s }[0], [x7], #0x4\n"
+ "st1 { v20.s }[0], [x8], #0x4\n"
+ "st1 { v8.s }[0], [x17], #0x4\n"
+ "st1 { v6.s }[0], [x16], #0x4\n"
"tbz x4, #1, 120f\n"
- "st1 { v11.h }[2], [x21], #0x2\n"
- "st1 { v14.h }[2], [x15], #0x2\n"
- "st1 { v9.h }[2], [x17], #0x2\n"
- "st1 { v7.h }[2], [x16], #0x2\n"
+ "st1 { v11.h }[2], [x7], #0x2\n"
+ "st1 { v20.h }[2], [x8], #0x2\n"
+ "st1 { v8.h }[2], [x17], #0x2\n"
+ "st1 { v6.h }[2], [x16], #0x2\n"
"tbz x4, #0, 123f\n"
- "st1 { v11.b }[6], [x21], #0x1\n"
- "st1 { v14.b }[6], [x15], #0x1\n"
- "st1 { v9.b }[6], [x17], #0x1\n"
- "st1 { v7.b }[6], [x16], #0x1\n"
+ "st1 { v11.b }[6], [x7], #0x1\n"
+ "st1 { v20.b }[6], [x8], #0x1\n"
+ "st1 { v8.b }[6], [x17], #0x1\n"
+ "st1 { v6.b }[6], [x16], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x4, #0, 123f\n"
- "st1 { v11.b }[4], [x21], #0x1\n"
- "st1 { v14.b }[4], [x15], #0x1\n"
- "st1 { v9.b }[4], [x17], #0x1\n"
- "st1 { v7.b }[4], [x16], #0x1\n"
+ "st1 { v11.b }[4], [x7], #0x1\n"
+ "st1 { v20.b }[4], [x8], #0x1\n"
+ "st1 { v8.b }[4], [x17], #0x1\n"
+ "st1 { v6.b }[4], [x16], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x4, #1, 122f\n"
- "st1 { v11.h }[0], [x21], #0x2\n"
- "st1 { v14.h }[0], [x15], #0x2\n"
- "st1 { v9.h }[0], [x17], #0x2\n"
- "st1 { v7.h }[0], [x16], #0x2\n"
+ "st1 { v11.h }[0], [x7], #0x2\n"
+ "st1 { v20.h }[0], [x8], #0x2\n"
+ "st1 { v8.h }[0], [x17], #0x2\n"
+ "st1 { v6.h }[0], [x16], #0x2\n"
"tbz x4, #0, 123f\n"
- "st1 { v11.b }[2], [x21], #0x1\n"
- "st1 { v14.b }[2], [x15], #0x1\n"
- "st1 { v9.b }[2], [x17], #0x1\n"
- "st1 { v7.b }[2], [x16], #0x1\n"
+ "st1 { v11.b }[2], [x7], #0x1\n"
+ "st1 { v20.b }[2], [x8], #0x1\n"
+ "st1 { v8.b }[2], [x17], #0x1\n"
+ "st1 { v6.b }[2], [x16], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x4, #0, 123f\n"
- "st1 { v11.b }[0], [x21], #0x1\n"
- "st1 { v14.b }[0], [x15], #0x1\n"
- "st1 { v9.b }[0], [x17], #0x1\n"
- "st1 { v7.b }[0], [x16], #0x1\n"
+ "st1 { v11.b }[0], [x7], #0x1\n"
+ "st1 { v20.b }[0], [x8], #0x1\n"
+ "st1 { v8.b }[0], [x17], #0x1\n"
+ "st1 { v6.b }[0], [x16], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 96cde40e04..4419048793 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,1072 +91,1072 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x7, x6, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v24.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v22.16b }, [x24]\n"
- "ld1r { v12.16b }, [x23]\n"
- "lsr x16, x8, #0x3\n"
- "ld1r { v14.8h }, [x21]\n"
- "ld1r { v17.8h }, [x20]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "ld1r { v15.8h }, [x19]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x22, #0x0]\n"
- "ldp x28, x27, [x22, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v19.16b, v13.16b\n"
- "ldr q26, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v11.16b, v26.16b\n"
- "mov v18.16b, v13.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v24.16b, v26.16b\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.8h }, [x21]\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "mov x8, #0x0\n"
+ "mov x17, #0x0\n"
+ "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x12, x11, [x22, #0x0]\n"
+ "ldp x10, x9, [x22, #0x10]\n"
+ "cbz x7, 3f\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
+ "subs x7, x7, #0x1\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "ssubl v5.8h, v5.8b, v15.8b\n"
+ "ssubl v6.8h, v6.8b, v15.8b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v8.8h, v8.8b, v15.8b\n"
+ "ldr q13, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "str x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
"mov v9.16b, v13.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v26.16b\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ldr x19, [x12, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d31, [x24, x8]\n"
+ "ldr d30, [x23, x8]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d29, [x22, x8]\n"
+ "ldr d28, [x21, x8]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ldr d27, [x20, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "usubl v27.8h, v27.8b, v24.8b\n"
"beq 2f\n"
"1:" // Loop
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q22, [x13, #0x0]\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr x25, [x12, #0x40]\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr q23, [x14, #0x10]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr x21, [x16, #0x28]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr x19, [x12, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x23, [x12, #0x58]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x10]\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "ldr d29, [x20, x8]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "ldr x21, [x12, #0x68]\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "ldr x19, [x12, #0x78]\n"
- "ldr q21, [x13, #0x0]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x28, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "ldr q25, [x11, #0x0]\n"
- "ldr q10, [x13, #0x10]\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "ldr q16, [x11, #0x10]\n"
- "add x17, x17, #0x48\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "subs x16, x16, #0x1\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "add x15, x15, #0x48\n"
+ "subs x7, x7, #0x1\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x27, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "add x14, x14, #0x20\n"
"add x13, x13, #0x20\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "add x11, x11, #0x20\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x26, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "ldr d28, [x24, x8]\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x8]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x23, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x8]\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x20, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "add x8, x8, #0x8\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d13, [x10, x14]\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d13, [x12, x17]\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d19, [x9, x14]\n"
- "str d18, [x28, x14]\n"
- "str d9, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q26, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v19.16b, v13.16b\n"
- "mov v11.16b, v26.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v18.16b, v13.16b\n"
- "mov v24.16b, v26.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d9, [x11, x17]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str d16, [x10, x17]\n"
+ "str d25, [x9, x17]\n"
+ "ldr q13, [x28, #0x0]\n"
+ "ldr q20, [x28, #0x10]\n"
+ "add x28, x28, #0x20\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
+ "add x17, x17, #0x8\n"
+ "str x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
"mov v9.16b, v13.16b\n"
- "mov v23.16b, v26.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr x19, [x12, #0x20]\n"
- "ldr d31, [x23, x15]\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ldr d30, [x22, x15]\n"
- "ldr d29, [x21, x15]\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ldr d28, [x20, x15]\n"
- "ldr d27, [x19, x15]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "ldr d31, [x24, x8]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d30, [x23, x8]\n"
+ "ldr d29, [x22, x8]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v5.8h, v5.8b, v15.8b\n"
+ "ldr d28, [x21, x8]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d27, [x20, x8]\n"
+ "ssubl v8.8h, v8.8b, v15.8b\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "usubl v27.8h, v27.8b, v24.8b\n"
"bgt 1b\n"
"2:" // Tail
+ "ldr q17, [x14, #0x0]\n"
+ "ldr q22, [x13, #0x0]\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr x25, [x12, #0x40]\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr q23, [x14, #0x10]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr x21, [x16, #0x28]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr x19, [x12, #0x48]\n"
- "ldr d30, [x19, x15]\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x23, [x12, #0x58]\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x10]\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "ldr d29, [x20, x8]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "ldr x27, [x16, #0x40]\n"
+ "ldr x26, [x16, #0x48]\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "ldr x21, [x12, #0x68]\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "ldr x19, [x12, #0x78]\n"
- "ldr q21, [x13, #0x0]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x28, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "ldr q25, [x11, #0x0]\n"
- "ldr q10, [x13, #0x10]\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "ldr q16, [x11, #0x10]\n"
- "tst x8, #0x7\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "tst x6, #0x7\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "add x14, x14, #0x20\n"
"add x13, x13, #0x20\n"
- "add x11, x11, #0x20\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x27, x8]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x15]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x26, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "ldr d28, [x24, x8]\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x25, x8]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x15]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x23, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x15]\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x22, x8]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x15]\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "ldr d28, [x19, x15]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x15, x15, #0x8\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x21, x8]\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x20, x8]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "add x8, x8, #0x8\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d13, [x10, x14]\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d13, [x12, x17]\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d19, [x9, x14]\n"
- "str d18, [x28, x14]\n"
- "str d9, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d9, [x11, x17]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str d16, [x10, x17]\n"
+ "str d25, [x9, x17]\n"
+ "add x17, x17, #0x8\n"
"beq 64f\n"
- "add x17, x17, #0x48\n"
+ "add x15, x15, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v13.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v26.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v26.s }[2], [x19]\n"
+ "ldr x28, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x6, #2, 5f\n"
+ "ld1 { v13.4s }, [x28], #0x10\n"
+ "tbz x6, #1, 4f\n"
+ "ld1 { v20.d }[0], [x28], #0x8\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v20.s }[2], [x28]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v26.s }[0], [x19]\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v20.s }[0], [x28]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "tbz x6, #1, 6f\n"
+ "ld1 { v13.d }[0], [x28], #0x8\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v13.s }[2], [x28]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v13.s }[0], [x19]\n"
+ "tbz x6, #0, 7f\n"
+ "ld1 { v13.s }[0], [x28]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "mov v19.16b, v13.16b\n"
- "mov v11.16b, v26.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v18.16b, v13.16b\n"
- "mov v24.16b, v26.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
+ "ldr d0, [x15, #0x0]\n"
+ "ldr d1, [x15, #0x8]\n"
"mov v9.16b, v13.16b\n"
- "mov v23.16b, v26.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ldr x19, [x12, #0x20]\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "ld1 { v30.s }[0], [x22], #0x4\n"
- "ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "ld1 { v30.h }[2], [x22], #0x2\n"
- "ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x23]\n"
- "ld1 { v30.b }[6], [x22]\n"
- "ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "mov v18.16b, v20.16b\n"
+ "ldr d2, [x15, #0x10]\n"
+ "ldr d3, [x15, #0x18]\n"
+ "mov v16.16b, v13.16b\n"
+ "mov v26.16b, v20.16b\n"
+ "ldr d4, [x15, #0x20]\n"
+ "ldr d5, [x15, #0x28]\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v10.16b, v20.16b\n"
+ "ldr d6, [x15, #0x30]\n"
+ "ldr d7, [x15, #0x38]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d8, [x15, #0x40]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v5.8h, v5.8b, v15.8b\n"
+ "ssubl v6.8h, v6.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v8.8h, v8.8b, v15.8b\n"
+ "add x24, x24, x8\n"
+ "add x23, x23, x8\n"
+ "add x22, x22, x8\n"
+ "add x21, x21, x8\n"
+ "add x20, x20, x8\n"
+ "tbz x6, #2, 9f\n"
+ "ld1 { v31.s }[0], [x24], #0x4\n"
+ "ld1 { v30.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x6, #1, 8f\n"
+ "ld1 { v31.h }[2], [x24], #0x2\n"
+ "ld1 { v30.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[6], [x24]\n"
+ "ld1 { v30.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x23]\n"
- "ld1 { v30.b }[4], [x22]\n"
- "ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[4], [x24]\n"
+ "ld1 { v30.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "ld1 { v30.h }[0], [x22], #0x2\n"
- "ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x23]\n"
- "ld1 { v30.b }[2], [x22]\n"
- "ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x6, #1, 10f\n"
+ "ld1 { v31.h }[0], [x24], #0x2\n"
+ "ld1 { v30.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[2], [x24]\n"
+ "ld1 { v30.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x23]\n"
- "ld1 { v30.b }[0], [x22]\n"
- "ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x6, #0, 11f\n"
+ "ld1 { v31.b }[0], [x24]\n"
+ "ld1 { v30.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x21, [x12, #0x28]\n"
- "smlal v19.4s, v31.4h, v3.4h\n"
- "smlal2 v11.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "add x21, x21, x15\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v24.4s, v31.8h, v1.8h\n"
- "smlal v9.4s, v31.4h, v0.4h\n"
- "smlal2 v23.4s, v31.8h, v0.8h\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v20.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "add x21, x21, x8\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v26.4s, v31.8h, v1.8h\n"
+ "smlal v25.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v26.4s, v30.8h, v0.8h\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "smlal v19.4s, v29.4h, v2.4h\n"
- "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "smlal2 v20.4s, v30.8h, v0.8h\n"
+ "usubl v27.8h, v27.8b, v24.8b\n"
+ "smlal v9.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v26.4s, v28.8h, v5.8h\n"
- "smlal v19.4s, v28.4h, v4.4h\n"
- "smlal2 v11.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v28.4h, v2.4h\n"
- "smlal2 v24.4s, v28.8h, v2.8h\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "tbz x8, #2, 13f\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "smlal v25.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "tbz x6, #2, 13f\n"
"ld1 { v31.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 12f\n"
+ "tbz x6, #1, 12f\n"
"ld1 { v31.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[6], [x21]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[4], [x21]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 14f\n"
+ "tbz x6, #1, 14f\n"
"ld1 { v31.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[2], [x21]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x6, #0, 15f\n"
"ld1 { v31.b }[0], [x21]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "smlal v18.4s, v31.4h, v6.4h\n"
- "smlal2 v24.4s, v31.8h, v6.8h\n"
- "ldr x20, [x12, #0x30]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x16, #0x30]\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "smlal v19.4s, v27.4h, v6.4h\n"
- "smlal2 v11.4s, v27.8h, v6.8h\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v24.4s, v27.8h, v4.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "tbz x8, #2, 17f\n"
+ "smlal2 v20.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x8\n"
+ "smlal v9.4s, v27.4h, v6.4h\n"
+ "smlal2 v18.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v26.4s, v27.8h, v4.8h\n"
+ "smlal v25.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "tbz x6, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 16f\n"
+ "tbz x6, #1, 16f\n"
"ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 18f\n"
+ "tbz x6, #1, 18f\n"
"ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
+ "tbz x6, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x26, [x12, #0x38]\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "add x26, x26, x15\n"
- "tbz x8, #2, 21f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x8, #1, 20f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x28, [x16, #0x38]\n"
+ "smlal v25.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "add x28, x28, x8\n"
+ "tbz x6, #2, 21f\n"
+ "ld1 { v28.s }[0], [x28], #0x4\n"
+ "tbz x6, #1, 20f\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[6], [x28]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[4], [x28]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x8, #1, 22f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x6, #1, 22f\n"
+ "ld1 { v28.h }[0], [x28], #0x2\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[2], [x28]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x6, #0, 23f\n"
+ "ld1 { v28.b }[0], [x28]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr x25, [x12, #0x40]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "ldr x27, [x16, #0x40]\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v19.4s, v28.4h, v0.4h\n"
- "smlal2 v11.4s, v28.8h, v0.8h\n"
- "add x25, x25, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v31.s }[0], [x25], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v31.h }[2], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[6], [x25]\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v0.4h\n"
+ "smlal2 v18.4s, v28.8h, v0.8h\n"
+ "add x27, x27, x8\n"
+ "tbz x6, #2, 25f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x6, #1, 24f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[6], [x27]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[4], [x25]\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[4], [x27]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v31.h }[0], [x25], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[2], [x25]\n"
+ "tbz x6, #1, 26f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[2], [x27]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v31.b }[0], [x25]\n"
+ "tbz x6, #0, 27f\n"
+ "ld1 { v31.b }[0], [x27]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr x19, [x12, #0x48]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "ldr x26, [x16, #0x48]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v19.4s, v31.4h, v1.4h\n"
- "smlal2 v11.4s, v31.8h, v1.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 29f\n"
- "ld1 { v30.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 28f\n"
- "ld1 { v30.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[6], [x19]\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v9.4s, v31.4h, v1.4h\n"
+ "smlal2 v18.4s, v31.8h, v1.8h\n"
+ "add x26, x26, x8\n"
+ "tbz x6, #2, 29f\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "tbz x6, #1, 28f\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[6], [x26]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[4], [x19]\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[4], [x26]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x8, #1, 30f\n"
- "ld1 { v30.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[2], [x19]\n"
+ "tbz x6, #1, 30f\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[2], [x26]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
- "ld1 { v30.b }[0], [x19]\n"
+ "tbz x6, #0, 31f\n"
+ "ld1 { v30.b }[0], [x26]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x24, [x12, #0x50]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "ldr x25, [x16, #0x50]\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v19.4s, v30.4h, v7.4h\n"
- "smlal2 v11.4s, v30.8h, v7.8h\n"
- "add x24, x24, x15\n"
- "smlal v18.4s, v30.4h, v5.4h\n"
- "smlal2 v24.4s, v30.8h, v5.8h\n"
- "smlal v9.4s, v30.4h, v4.4h\n"
- "smlal2 v23.4s, v30.8h, v4.8h\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "smlal2 v20.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v7.4h\n"
+ "smlal2 v18.4s, v30.8h, v7.8h\n"
+ "add x25, x25, x8\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "smlal v25.4s, v30.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "tbz x6, #2, 33f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x6, #1, 32f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x6, #1, 34f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x6, #0, 35f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x23, [x12, #0x58]\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x24, [x16, #0x58]\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal v18.4s, v29.4h, v0.4h\n"
- "smlal2 v24.4s, v29.8h, v0.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "smlal2 v20.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v26.4s, v29.8h, v0.8h\n"
+ "add x24, x24, x8\n"
+ "tbz x6, #2, 37f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x6, #1, 36f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x6, #1, 38f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x6, #0, 39f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr x22, [x12, #0x60]\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal2 v11.4s, v28.8h, v5.8h\n"
- "smlal v9.4s, v28.4h, v2.4h\n"
- "smlal2 v23.4s, v28.8h, v2.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v31.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v31.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[6], [x22]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "ldr x23, [x16, #0x60]\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v18.4s, v28.8h, v5.8h\n"
+ "smlal v25.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "add x23, x23, x8\n"
+ "tbz x6, #2, 41f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "tbz x6, #1, 40f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[4], [x22]\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v31.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[2], [x22]\n"
+ "tbz x6, #1, 42f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v31.b }[0], [x22]\n"
+ "tbz x6, #0, 43f\n"
+ "ld1 { v31.b }[0], [x23]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr x21, [x12, #0x68]\n"
+ "usubl v31.8h, v31.8b, v24.8b\n"
+ "ldr x22, [x16, #0x68]\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal v18.4s, v31.4h, v3.4h\n"
- "smlal2 v24.4s, v31.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v26.4s, v31.8h, v3.8h\n"
+ "add x22, x22, x8\n"
+ "tbz x6, #2, 45f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x6, #1, 44f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x6, #1, 46f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x6, #0, 47f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v19.4s, v30.4h, v8.4h\n"
- "smlal2 v11.4s, v30.8h, v8.8h\n"
- "smlal v9.4s, v30.4h, v5.4h\n"
- "smlal2 v23.4s, v30.8h, v5.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "usubl v30.8h, v30.8b, v24.8b\n"
+ "ldr x21, [x16, #0x70]\n"
+ "smlal v9.4s, v30.4h, v8.4h\n"
+ "smlal2 v18.4s, v30.8h, v8.8h\n"
+ "smlal v25.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "add x21, x21, x8\n"
+ "tbz x6, #2, 49f\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "tbz x6, #1, 48f\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x6, #1, 50f\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x6, #0, 51f\n"
+ "ld1 { v29.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x19, [x12, #0x78]\n"
- "smlal v18.4s, v29.4h, v7.4h\n"
- "smlal2 v24.4s, v29.8h, v7.8h\n"
- "smlal v9.4s, v29.4h, v6.4h\n"
- "smlal2 v23.4s, v29.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v28.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v28.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[6], [x19]\n"
+ "usubl v29.8h, v29.8b, v24.8b\n"
+ "ldr x20, [x16, #0x78]\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v26.4s, v29.8h, v7.8h\n"
+ "smlal v25.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "add x20, x20, x8\n"
+ "tbz x6, #2, 53f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x6, #1, 52f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[4], [x19]\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v28.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[2], [x19]\n"
+ "tbz x6, #1, 54f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v28.b }[0], [x19]\n"
+ "tbz x6, #0, 55f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v18.4s, v28.4h, v8.4h\n"
- "smlal2 v24.4s, v28.8h, v8.8h\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v21.4s }, [x13], #0x10\n"
- "ld1 { v25.4s }, [x11], #0x10\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v10.d }[0], [x13], #0x8\n"
- "ld1 { v16.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v16.s }[2], [x11]\n"
+ "usubl v28.8h, v28.8b, v24.8b\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v26.4s, v28.8h, v8.8h\n"
+ "smlal v25.4s, v28.4h, v7.4h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "tbz x6, #2, 57f\n"
+ "ld1 { v17.4s }, [x14], #0x10\n"
+ "ld1 { v22.4s }, [x13], #0x10\n"
+ "tbz x6, #1, 56f\n"
+ "ld1 { v23.d }[0], [x14], #0x8\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v23.s }[2], [x14]\n"
+ "ld1 { v19.s }[2], [x13]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v10.s }[0], [x13]\n"
- "ld1 { v16.s }[0], [x11]\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v23.s }[0], [x14]\n"
+ "ld1 { v19.s }[0], [x13]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v21.d }[0], [x13], #0x8\n"
- "ld1 { v25.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[2], [x13]\n"
- "ld1 { v25.s }[2], [x11]\n"
+ "tbz x6, #1, 58f\n"
+ "ld1 { v17.d }[0], [x14], #0x8\n"
+ "ld1 { v22.d }[0], [x13], #0x8\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v17.s }[2], [x14]\n"
+ "ld1 { v22.s }[2], [x13]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v21.s }[0], [x13]\n"
- "ld1 { v25.s }[0], [x11]\n"
+ "tbz x6, #0, 59f\n"
+ "ld1 { v17.s }[0], [x14]\n"
+ "ld1 { v22.s }[0], [x13]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v21.4s\n"
- "sqrdmulh v19.4s, v19.4s, v21.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v18.4s, v18.4s, v21.4s\n"
- "sqrdmulh v9.4s, v9.4s, v21.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v7.16b, v13.16b, v25.16b\n"
- "sqrdmulh v26.4s, v26.4s, v10.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v11.4s, v11.4s, v10.4s\n"
- "and v21.16b, v18.16b, v25.16b\n"
- "sqrdmulh v24.4s, v24.4s, v10.4s\n"
- "and v20.16b, v9.16b, v25.16b\n"
- "sqrdmulh v23.4s, v23.4s, v10.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v29.16b, v26.16b, v16.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v10.16b, v11.16b, v16.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "and v21.16b, v13.16b, v22.16b\n"
+ "add x12, x12, x17\n"
+ "add x11, x11, x17\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v31.16b, v24.16b, v16.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v23.16b, v16.16b\n"
- "sqadd v13.4s, v13.4s, v7.4s\n"
+ "add x10, x10, x17\n"
+ "add x9, x9, x17\n"
+ "and v29.16b, v20.16b, v19.16b\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v17.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v22.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "and v27.16b, v16.16b, v22.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "and v21.16b, v25.16b, v22.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v19.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v29.16b, v10.16b, v19.16b\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v27.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v21.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v21.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v25.4s\n"
- "sqadd v26.4s, v26.4s, v29.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v11.4s, v11.4s, v10.4s\n"
- "srshl v18.4s, v18.4s, v25.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "srshl v9.4s, v9.4s, v25.4s\n"
- "sqadd v23.4s, v23.4s, v30.4s\n"
- "srshl v26.4s, v26.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "srshl v9.4s, v9.4s, v22.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqadd v10.4s, v10.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
"sqxtn v13.4h, v13.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "sqxtn v19.4h, v19.4s\n"
- "srshl v24.4s, v24.4s, v16.4s\n"
- "sqxtn v18.4h, v18.4s\n"
- "srshl v23.4s, v23.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "sqxtn2 v13.8h, v26.4s\n"
- "sqxtn2 v19.8h, v11.4s\n"
- "sqxtn2 v18.8h, v24.4s\n"
- "sqxtn2 v9.8h, v23.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v13.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v18.4s\n"
+ "sqxtn2 v16.8h, v26.4s\n"
+ "sqxtn2 v25.8h, v10.4s\n"
"sqadd v13.8h, v13.8h, v14.8h\n"
- "sqadd v19.8h, v19.8h, v14.8h\n"
- "sqadd v18.8h, v18.8h, v14.8h\n"
"sqadd v9.8h, v9.8h, v14.8h\n"
- "smax v13.8h, v13.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smax v18.8h, v18.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smin v13.8h, v13.8h, v15.8h\n"
- "smin v19.8h, v19.8h, v15.8h\n"
- "smin v18.8h, v18.8h, v15.8h\n"
- "smin v9.8h, v9.8h, v15.8h\n"
+ "sqadd v16.8h, v16.8h, v14.8h\n"
+ "sqadd v25.8h, v25.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v16.8h, v16.8h, v12.8h\n"
+ "smax v25.8h, v25.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v9.8h, v9.8h, v11.8h\n"
+ "smin v16.8h, v16.8h, v11.8h\n"
+ "smin v25.8h, v25.8h, v11.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "tbz x8, #2, 61f\n"
- "st1 { v13.s }[0], [x10], #0x4\n"
- "st1 { v19.s }[0], [x9], #0x4\n"
- "st1 { v18.s }[0], [x28], #0x4\n"
- "st1 { v9.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 60f\n"
- "st1 { v13.h }[2], [x10], #0x2\n"
- "st1 { v19.h }[2], [x9], #0x2\n"
- "st1 { v18.h }[2], [x28], #0x2\n"
- "st1 { v9.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[6], [x10], #0x1\n"
- "st1 { v19.b }[6], [x9], #0x1\n"
- "st1 { v18.b }[6], [x28], #0x1\n"
- "st1 { v9.b }[6], [x27], #0x1\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "tbz x6, #2, 61f\n"
+ "st1 { v13.s }[0], [x12], #0x4\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v16.s }[0], [x10], #0x4\n"
+ "st1 { v25.s }[0], [x9], #0x4\n"
+ "tbz x6, #1, 60f\n"
+ "st1 { v13.h }[2], [x12], #0x2\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v16.h }[2], [x10], #0x2\n"
+ "st1 { v25.h }[2], [x9], #0x2\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[6], [x12], #0x1\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v16.b }[6], [x10], #0x1\n"
+ "st1 { v25.b }[6], [x9], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[4], [x10], #0x1\n"
- "st1 { v19.b }[4], [x9], #0x1\n"
- "st1 { v18.b }[4], [x28], #0x1\n"
- "st1 { v9.b }[4], [x27], #0x1\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[4], [x12], #0x1\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v16.b }[4], [x10], #0x1\n"
+ "st1 { v25.b }[4], [x9], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "st1 { v13.h }[0], [x10], #0x2\n"
- "st1 { v19.h }[0], [x9], #0x2\n"
- "st1 { v18.h }[0], [x28], #0x2\n"
- "st1 { v9.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[2], [x10], #0x1\n"
- "st1 { v19.b }[2], [x9], #0x1\n"
- "st1 { v18.b }[2], [x28], #0x1\n"
- "st1 { v9.b }[2], [x27], #0x1\n"
+ "tbz x6, #1, 62f\n"
+ "st1 { v13.h }[0], [x12], #0x2\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v16.h }[0], [x10], #0x2\n"
+ "st1 { v25.h }[0], [x9], #0x2\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[2], [x12], #0x1\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v16.b }[2], [x10], #0x1\n"
+ "st1 { v25.b }[2], [x9], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "st1 { v13.b }[0], [x10], #0x1\n"
- "st1 { v19.b }[0], [x9], #0x1\n"
- "st1 { v18.b }[0], [x28], #0x1\n"
- "st1 { v9.b }[0], [x27], #0x1\n"
+ "tbz x6, #0, 63f\n"
+ "st1 { v13.b }[0], [x12], #0x1\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v16.b }[0], [x10], #0x1\n"
+ "st1 { v25.b }[0], [x9], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 079b212e6c..5124b2c8f3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,75 +100,75 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
- "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x19, %[offsetof_Requantize32_minval]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x19, x19, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.16b }, [x24]\n"
- "ld1r { v13.16b }, [x23]\n"
- "lsr x16, x8, #0x3\n"
- "ld1r { v11.8h }, [x21]\n"
- "ld1r { v17.8h }, [x20]\n"
- "mov x15, #0x0\n"
- "mov x14, #0x0\n"
- "ld1r { v14.8h }, [x19]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v16.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x10, x9, [x22, #0x0]\n"
- "ldp x28, x27, [x22, #0x10]\n"
- "cbz x16, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x16, x16, #0x1\n"
- "mov v9.16b, v15.16b\n"
- "ldr q10, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
- "mov v16.16b, v10.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v21.16b, v10.16b\n"
- "mov v23.16b, v15.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v18.16b, v10.16b\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
- "ldp x26, x25, [x12, #0x0]\n"
- "ldp x24, x23, [x12, #0x10]\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ldp x22, x21, [x12, #0x20]\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "ldr q15, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d31, [x27, x17]\n"
+ "ldr d30, [x26, x17]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr d28, [x24, x17]\n"
"usubl v31.8h, v31.8b, v12.8b\n"
"usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d27, [x23, x17]\n"
+ "ldr d26, [x22, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr d24, [x20, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
"usubl v25.8h, v25.8b, v12.8b\n"
@@ -176,250 +176,250 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x19, [x12, #0x58]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x22, [x12, #0x78]\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x0]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "ldr x21, [x12, #0x80]\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x21, [x15, #0x80]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "ldr x20, [x12, #0x88]\n"
- "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x12, #0x70]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr x24, [x12, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x25, [x15, #0x98]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "ldr x22, [x12, #0xa8]\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x21, x17]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "ldr x21, [x12, #0xb0]\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x12, #0xc0]\n"
- "ldr q19, [x13, #0x0]\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x24, x17]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q4, [x13, #0x10]\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "ldr q31, [x11, #0x10]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x14, x14, #0x48\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "add x17, x17, #0x48\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
- "subs x16, x16, #0x1\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
"add x13, x13, #0x20\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "add x12, x12, #0x20\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x15, x15, #0x8\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "add x11, x11, #0x20\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
+ "str d15, [x11, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str d9, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d22, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x14, x14, #0x8\n"
- "ldr q10, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
- "ldr d2, [x17, #0x10]\n"
+ "str d10, [x10, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str d9, [x9, x16]\n"
+ "str d21, [x28, x16]\n"
+ "ldr q15, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
"mov v9.16b, v15.16b\n"
- "mov v16.16b, v10.16b\n"
- "ldr d3, [x17, #0x18]\n"
- "ldr d4, [x17, #0x20]\n"
- "mov v22.16b, v15.16b\n"
- "mov v21.16b, v10.16b\n"
- "ldr d5, [x17, #0x28]\n"
- "ldr d6, [x17, #0x30]\n"
- "mov v23.16b, v15.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d7, [x17, #0x38]\n"
- "ldr d8, [x17, #0x40]\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "ldp x26, x25, [x12, #0x0]\n"
- "ldp x24, x23, [x12, #0x10]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x22, x21, [x12, #0x20]\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d31, [x27, x17]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldr d31, [x26, x15]\n"
- "ldr d30, [x25, x15]\n"
+ "ldr d30, [x26, x17]\n"
+ "ldr d29, [x25, x17]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
- "ldr d29, [x24, x15]\n"
- "ldr d28, [x23, x15]\n"
+ "ldr d28, [x24, x17]\n"
+ "ldr d27, [x23, x17]\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
"usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d27, [x22, x15]\n"
- "ldr d26, [x21, x15]\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr d25, [x21, x17]\n"
"usubl v30.8h, v30.8b, v12.8b\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d25, [x20, x15]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr d24, [x20, x17]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
@@ -428,966 +428,966 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x19, [x12, #0x58]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x22, [x12, #0x78]\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "ldr d28, [x23, x15]\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr q19, [x13, #0x0]\n"
+ "ldr x23, [x15, #0x78]\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x17]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x24, x15]\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x15]\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x15]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "ldr x21, [x12, #0x80]\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x21, [x15, #0x80]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x20, x15]\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "ldr x20, [x12, #0x88]\n"
- "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x12, #0x70]\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "ldr d28, [x21, x15]\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "ldr q18, [x12, #0x0]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x17]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "ldr x24, [x12, #0x98]\n"
- "ldr d24, [x19, x15]\n"
+ "ldr x25, [x15, #0x98]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "ldr d27, [x22, x15]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x23, x17]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "ldr x22, [x12, #0xa8]\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d26, [x20, x15]\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
+ "ldr d24, [x21, x17]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "ldr x21, [x12, #0xb0]\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x19, x15]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "ldr q30, [x13, #0x10]\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "ldr x19, [x12, #0xc0]\n"
- "ldr q19, [x13, #0x0]\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "ldr d25, [x23, x15]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x24, x17]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x15]\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x25, x17]\n"
+ "ldr q31, [x12, #0x10]\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "ldr q0, [x11, #0x0]\n"
- "ldr q4, [x13, #0x10]\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "ldr q31, [x11, #0x10]\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x15]\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "tst x7, #0x7\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x23, x17]\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr d26, [x21, x15]\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "tst x8, #0x7\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
"add x13, x13, #0x20\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x15]\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "add x11, x11, #0x20\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "ldr d29, [x19, x15]\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x20, x17]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "add x17, x17, #0x8\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x15, x15, #0x8\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x10, x14]\n"
+ "str d15, [x11, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str d9, [x9, x14]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d22, [x28, x14]\n"
- "str d23, [x27, x14]\n"
- "add x14, x14, #0x8\n"
+ "str d10, [x10, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str d9, [x9, x16]\n"
+ "str d21, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
"beq 88f\n"
- "add x17, x17, #0x48\n"
+ "add x14, x14, #0x48\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x8, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x8, #1, 4f\n"
- "ld1 { v10.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v10.s }[2], [x19]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v15.4s }, [x24], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v17.d }[0], [x24], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v17.s }[2], [x24]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v10.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v17.s }[0], [x24]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x8, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x7, #1, 6f\n"
+ "ld1 { v15.d }[0], [x24], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[2], [x24]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v15.s }[0], [x24]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x17, #0x0]\n"
- "ldr d1, [x17, #0x8]\n"
+ "ldr d0, [x14, #0x0]\n"
+ "ldr d1, [x14, #0x8]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "ldr d2, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
"mov v9.16b, v15.16b\n"
- "mov v16.16b, v10.16b\n"
- "ldr d2, [x17, #0x10]\n"
- "ldr d3, [x17, #0x18]\n"
- "mov v22.16b, v15.16b\n"
- "mov v21.16b, v10.16b\n"
- "ldr d4, [x17, #0x20]\n"
- "ldr d5, [x17, #0x28]\n"
- "mov v23.16b, v15.16b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d6, [x17, #0x30]\n"
- "ldr d7, [x17, #0x38]\n"
+ "mov v23.16b, v17.16b\n"
+ "ldr d4, [x14, #0x20]\n"
+ "ldr d5, [x14, #0x28]\n"
+ "mov v21.16b, v15.16b\n"
+ "mov v22.16b, v17.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d8, [x17, #0x40]\n"
- "ldp x26, x25, [x12, #0x0]\n"
+ "ldr d8, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x24, x23, [x12, #0x10]\n"
- "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldp x20, x19, [x12, #0x30]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "add x26, x26, x15\n"
- "add x25, x25, x15\n"
- "add x24, x24, x15\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
- "add x21, x21, x15\n"
- "add x20, x20, x15\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 9f\n"
- "ld1 { v31.s }[0], [x26], #0x4\n"
- "ld1 { v30.s }[0], [x25], #0x4\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 8f\n"
- "ld1 { v31.h }[2], [x26], #0x2\n"
- "ld1 { v30.h }[2], [x25], #0x2\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[6], [x26]\n"
- "ld1 { v30.b }[6], [x25]\n"
- "ld1 { v29.b }[6], [x24]\n"
- "ld1 { v28.b }[6], [x23]\n"
- "ld1 { v27.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
+ "ld1 { v25.b }[6], [x21]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[4], [x26]\n"
- "ld1 { v30.b }[4], [x25]\n"
- "ld1 { v29.b }[4], [x24]\n"
- "ld1 { v28.b }[4], [x23]\n"
- "ld1 { v27.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v25.b }[4], [x21]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x8, #1, 10f\n"
- "ld1 { v31.h }[0], [x26], #0x2\n"
- "ld1 { v30.h }[0], [x25], #0x2\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[2], [x26]\n"
- "ld1 { v30.b }[2], [x25]\n"
- "ld1 { v29.b }[2], [x24]\n"
- "ld1 { v28.b }[2], [x23]\n"
- "ld1 { v27.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 10f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
+ "ld1 { v25.b }[2], [x21]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 11f\n"
- "ld1 { v31.b }[0], [x26]\n"
- "ld1 { v30.b }[0], [x25]\n"
- "ld1 { v29.b }[0], [x24]\n"
- "ld1 { v28.b }[0], [x23]\n"
- "ld1 { v27.b }[0], [x22]\n"
- "ld1 { v26.b }[0], [x21]\n"
- "ld1 { v25.b }[0], [x20]\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v29.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
+ "ld1 { v25.b }[0], [x21]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"usubl v31.8h, v31.8b, v12.8b\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x24, [x12, #0x40]\n"
+ "smlal2 v17.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x15, #0x40]\n"
"usubl v30.8h, v30.8b, v12.8b\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "smlal2 v10.4s, v30.8h, v0.8h\n"
- "add x24, x24, x15\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "add x24, x24, x17\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v9.4s, v31.4h, v6.4h\n"
- "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "smlal v10.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "smlal2 v17.4s, v29.8h, v1.8h\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v9.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v9.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "smlal2 v17.4s, v25.8h, v4.8h\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v22.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v23.4s, v31.8h, v2.8h\n"
+ "smlal v21.4s, v31.4h, v0.4h\n"
+ "smlal2 v22.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v10.4s, v24.8h, v2.8h\n"
- "smlal v9.4s, v24.4h, v0.4h\n"
- "smlal2 v16.4s, v24.8h, v0.8h\n"
- "tbz x8, #2, 13f\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "tbz x7, #2, 13f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 12f\n"
+ "tbz x7, #1, 12f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x8, #1, 14f\n"
+ "tbz x7, #1, 14f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 15f\n"
+ "tbz x7, #0, 15f\n"
"ld1 { v29.b }[0], [x24]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x23, [x12, #0x48]\n"
- "smlal v9.4s, v29.4h, v4.4h\n"
- "smlal2 v16.4s, v29.8h, v4.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 17f\n"
- "ld1 { v28.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 16f\n"
- "ld1 { v28.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[6], [x23]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal v10.4s, v29.4h, v4.4h\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[4], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x8, #1, 18f\n"
- "ld1 { v28.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[2], [x23]\n"
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 19f\n"
- "ld1 { v28.b }[0], [x23]\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x21, [x12, #0x50]\n"
- "smlal v9.4s, v28.4h, v5.4h\n"
- "smlal2 v16.4s, v28.8h, v5.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 21f\n"
+ "ldr x21, [x15, #0x50]\n"
+ "smlal v10.4s, v28.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 21f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 20f\n"
+ "tbz x7, #1, 20f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x8, #1, 22f\n"
+ "tbz x7, #1, 22f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 23f\n"
+ "tbz x7, #0, 23f\n"
"ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x19, [x12, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "smlal2 v10.4s, v27.8h, v5.8h\n"
- "smlal v9.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 25f\n"
- "ld1 { v26.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 24f\n"
- "ld1 { v26.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[6], [x19]\n"
+ "smlal2 v17.4s, v27.8h, v5.8h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[4], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x8, #1, 26f\n"
- "ld1 { v26.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[2], [x19]\n"
+ "tbz x7, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 27f\n"
- "ld1 { v26.b }[0], [x19]\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
"usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v22.4s, v26.4h, v3.4h\n"
- "smlal2 v21.4s, v26.8h, v3.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 29f\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 28f\n"
+ "tbz x7, #1, 28f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x8, #1, 30f\n"
+ "tbz x7, #1, 30f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 31f\n"
+ "tbz x7, #0, 31f\n"
"ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
"usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x19, [x12, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"smlal v15.4s, v25.4h, v6.4h\n"
- "smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v22.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 33f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 32f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
+ "smlal2 v23.4s, v25.8h, v0.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x8, #1, 34f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 34f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 35f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x19, [x12, #0x70]\n"
- "smlal v22.4s, v29.4h, v4.4h\n"
- "smlal2 v21.4s, v29.8h, v4.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 37f\n"
- "ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 36f\n"
- "ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[6], [x19]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[4], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x8, #1, 38f\n"
- "ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[2], [x19]\n"
+ "tbz x7, #1, 38f\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 39f\n"
- "ld1 { v24.b }[0], [x19]\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v24.b }[0], [x21]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
"usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x22, [x12, #0x78]\n"
+ "ldr x23, [x15, #0x78]\n"
"smlal v15.4s, v24.4h, v7.4h\n"
- "smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v22.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 41f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 40f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "smlal2 v17.4s, v24.8h, v7.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "smlal2 v23.4s, v24.8h, v1.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x8, #1, 42f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 43f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
"usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x21, [x12, #0x80]\n"
- "smlal v23.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 45f\n"
+ "ldr x21, [x15, #0x80]\n"
+ "smlal v21.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 45f\n"
"ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 44f\n"
+ "tbz x7, #1, 44f\n"
"ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x8, #1, 46f\n"
+ "tbz x7, #1, 46f\n"
"ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 47f\n"
+ "tbz x7, #0, 47f\n"
"ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr x20, [x12, #0x88]\n"
- "smlal v9.4s, v28.4h, v7.4h\n"
- "smlal2 v16.4s, v28.8h, v7.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 49f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 48f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal v10.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v21.4s, v28.4h, v1.4h\n"
+ "smlal2 v22.4s, v28.8h, v1.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x8, #1, 50f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x7, #1, 50f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 51f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
"usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x23, [x12, #0x90]\n"
- "smlal v23.4s, v26.4h, v5.4h\n"
- "smlal2 v18.4s, v26.8h, v5.8h\n"
- "add x23, x23, x15\n"
- "tbz x8, #2, 53f\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "tbz x8, #1, 52f\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[6], [x23]\n"
+ "ldr x24, [x15, #0x90]\n"
+ "smlal v21.4s, v26.4h, v5.4h\n"
+ "smlal2 v22.4s, v26.8h, v5.8h\n"
+ "add x24, x24, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v25.s }[0], [x24], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[6], [x24]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[4], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[4], [x24]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x8, #1, 54f\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[2], [x23]\n"
+ "tbz x7, #1, 54f\n"
+ "ld1 { v25.h }[0], [x24], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[2], [x24]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 55f\n"
- "ld1 { v25.b }[0], [x23]\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v25.b }[0], [x24]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
"usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x24, [x12, #0x98]\n"
- "smlal v22.4s, v25.4h, v6.4h\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "add x24, x24, x15\n"
- "tbz x8, #2, 57f\n"
- "ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x8, #1, 56f\n"
- "ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[6], [x24]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v9.4s, v25.4h, v6.4h\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "add x25, x25, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v29.s }[0], [x25], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[6], [x25]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[4], [x24]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[4], [x25]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x8, #1, 58f\n"
- "ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[2], [x24]\n"
+ "tbz x7, #1, 58f\n"
+ "ld1 { v29.h }[0], [x25], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[2], [x25]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 59f\n"
- "ld1 { v29.b }[0], [x24]\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v29.b }[0], [x25]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr x19, [x12, #0xa0]\n"
- "smlal v9.4s, v29.4h, v8.4h\n"
- "smlal2 v16.4s, v29.8h, v8.8h\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "smlal2 v18.4s, v29.8h, v2.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 61f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 60f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v10.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v21.4s, v29.4h, v2.4h\n"
+ "smlal2 v22.4s, v29.8h, v2.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x8, #1, 62f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x7, #1, 62f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 63f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
"usubl v27.8h, v27.8b, v12.8b\n"
- "ldr x22, [x12, #0xa8]\n"
- "smlal v22.4s, v27.4h, v7.4h\n"
- "smlal2 v21.4s, v27.8h, v7.8h\n"
- "add x22, x22, x15\n"
- "tbz x8, #2, 65f\n"
- "ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x8, #1, 64f\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[6], [x22]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "smlal v9.4s, v27.4h, v7.4h\n"
+ "smlal2 v23.4s, v27.8h, v7.8h\n"
+ "add x23, x23, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[6], [x23]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[4], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[4], [x23]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x8, #1, 66f\n"
- "ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[2], [x22]\n"
+ "tbz x7, #1, 66f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[2], [x23]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 67f\n"
- "ld1 { v24.b }[0], [x22]\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v24.b }[0], [x23]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
"usubl v24.8h, v24.8b, v12.8b\n"
- "ldr x21, [x12, #0xb0]\n"
- "smlal v22.4s, v24.4h, v5.4h\n"
- "smlal2 v21.4s, v24.8h, v5.8h\n"
- "smlal v23.4s, v24.4h, v3.4h\n"
- "smlal2 v18.4s, v24.8h, v3.8h\n"
- "add x21, x21, x15\n"
- "tbz x8, #2, 69f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x8, #1, 68f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "smlal v9.4s, v24.4h, v5.4h\n"
+ "smlal2 v23.4s, v24.8h, v5.8h\n"
+ "smlal v21.4s, v24.4h, v3.4h\n"
+ "smlal2 v22.4s, v24.8h, v3.8h\n"
+ "add x22, x22, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[6], [x22]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[4], [x22]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x8, #1, 70f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x7, #1, 70f\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[2], [x22]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 71f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v26.b }[0], [x22]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
"usubl v26.8h, v26.8b, v12.8b\n"
- "ldr x20, [x12, #0xb8]\n"
- "smlal v23.4s, v26.4h, v7.4h\n"
- "smlal2 v18.4s, v26.8h, v7.8h\n"
- "add x20, x20, x15\n"
- "tbz x8, #2, 73f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x8, #1, 72f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "smlal v21.4s, v26.4h, v7.4h\n"
+ "smlal2 v22.4s, v26.8h, v7.8h\n"
+ "add x21, x21, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v25.s }[0], [x21], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v25.h }[2], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[6], [x21]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[4], [x21]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x8, #1, 74f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x7, #1, 74f\n"
+ "ld1 { v25.h }[0], [x21], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[2], [x21]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 75f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v25.b }[0], [x21]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
"usubl v25.8h, v25.8b, v12.8b\n"
- "ldr x19, [x12, #0xc0]\n"
- "smlal v22.4s, v25.4h, v8.4h\n"
- "smlal2 v21.4s, v25.8h, v8.8h\n"
- "smlal v23.4s, v25.4h, v6.4h\n"
- "smlal2 v18.4s, v25.8h, v6.8h\n"
- "add x19, x19, x15\n"
- "tbz x8, #2, 77f\n"
- "ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x8, #1, 76f\n"
- "ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[6], [x19]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v9.4s, v25.4h, v8.4h\n"
+ "smlal2 v23.4s, v25.8h, v8.8h\n"
+ "smlal v21.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[4], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x8, #1, 78f\n"
- "ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[2], [x19]\n"
+ "tbz x7, #1, 78f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 79f\n"
- "ld1 { v29.b }[0], [x19]\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v18.4s, v29.8h, v8.8h\n"
- "tbz x8, #2, 81f\n"
+ "smlal v21.4s, v29.4h, v8.4h\n"
+ "smlal2 v22.4s, v29.8h, v8.8h\n"
+ "tbz x7, #2, 81f\n"
"ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v0.4s }, [x11], #0x10\n"
- "tbz x8, #1, 80f\n"
- "ld1 { v4.d }[0], [x13], #0x8\n"
- "ld1 { v31.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
- "ld1 { v4.s }[2], [x13]\n"
- "ld1 { v31.s }[2], [x11]\n"
+ "ld1 { v18.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v30.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x12]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x8, #0, 83f\n"
- "ld1 { v4.s }[0], [x13]\n"
- "ld1 { v31.s }[0], [x11]\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v30.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x12]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x8, #1, 82f\n"
+ "tbz x7, #1, 82f\n"
"ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v0.d }[0], [x11], #0x8\n"
- "tbz x8, #0, 83f\n"
+ "ld1 { v18.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
"ld1 { v19.s }[2], [x13]\n"
- "ld1 { v0.s }[2], [x11]\n"
+ "ld1 { v18.s }[2], [x12]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 83f\n"
+ "tbz x7, #0, 83f\n"
"ld1 { v19.s }[0], [x13]\n"
- "ld1 { v0.s }[0], [x11]\n"
+ "ld1 { v18.s }[0], [x12]\n"
"83:" // Oddments: Load requant params: Bit 2: End
"sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "and v0.16b, v15.16b, v18.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v17.4s, v17.4s, v30.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v7.16b, v17.16b, v31.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
"sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "add x10, x10, x14\n"
- "add x9, x9, x14\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "add x28, x28, x14\n"
- "add x27, x27, x14\n"
- "and v30.16b, v15.16b, v0.16b\n"
- "sqrdmulh v10.4s, v10.4s, v4.4s\n"
- "and v28.16b, v9.16b, v0.16b\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "and v29.16b, v22.16b, v0.16b\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "and v20.16b, v23.16b, v0.16b\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v19.16b, v10.16b, v31.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v4.16b, v16.16b, v31.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v5.16b, v21.16b, v31.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v26.16b, v18.16b, v31.16b\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqadd v15.4s, v15.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v18.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "and v27.16b, v9.16b, v18.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "and v0.16b, v21.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqadd v17.4s, v17.4s, v7.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v9.4s, v9.4s, v28.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v29.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v20.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v31.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v31.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v7.16b, v22.16b, v31.16b\n"
"sqadd v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v0.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqadd v21.4s, v21.4s, v5.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "srshl v10.4s, v10.4s, v31.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "srshl v17.4s, v17.4s, v31.4s\n"
"sqxtn v15.4h, v15.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
"sqxtn v9.4h, v9.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqxtn v22.4h, v22.4s\n"
- "srshl v18.4s, v18.4s, v31.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v15.8h, v10.4s\n"
- "sqxtn2 v9.8h, v16.4s\n"
- "sqxtn2 v22.8h, v21.4s\n"
- "sqxtn2 v23.8h, v18.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v15.8h, v17.4s\n"
+ "sqxtn2 v10.8h, v20.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
"sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v10.8h, v10.8h, v11.8h\n"
"sqadd v9.8h, v9.8h, v11.8h\n"
- "sqadd v22.8h, v22.8h, v11.8h\n"
- "sqadd v23.8h, v23.8h, v11.8h\n"
- "smax v15.8h, v15.8h, v17.8h\n"
- "smax v9.8h, v9.8h, v17.8h\n"
- "smax v22.8h, v22.8h, v17.8h\n"
- "smax v23.8h, v23.8h, v17.8h\n"
+ "sqadd v21.8h, v21.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v16.8h\n"
+ "smax v10.8h, v10.8h, v16.8h\n"
+ "smax v9.8h, v9.8h, v16.8h\n"
+ "smax v21.8h, v21.8h, v16.8h\n"
"smin v15.8h, v15.8h, v14.8h\n"
+ "smin v10.8h, v10.8h, v14.8h\n"
"smin v9.8h, v9.8h, v14.8h\n"
- "smin v22.8h, v22.8h, v14.8h\n"
- "smin v23.8h, v23.8h, v14.8h\n"
+ "smin v21.8h, v21.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "tbz x8, #2, 85f\n"
- "st1 { v15.s }[0], [x10], #0x4\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v15.s }[0], [x11], #0x4\n"
+ "st1 { v10.s }[0], [x10], #0x4\n"
"st1 { v9.s }[0], [x9], #0x4\n"
- "st1 { v22.s }[0], [x28], #0x4\n"
- "st1 { v23.s }[0], [x27], #0x4\n"
- "tbz x8, #1, 84f\n"
- "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v21.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v15.h }[2], [x11], #0x2\n"
+ "st1 { v10.h }[2], [x10], #0x2\n"
"st1 { v9.h }[2], [x9], #0x2\n"
- "st1 { v22.h }[2], [x28], #0x2\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v21.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[6], [x11], #0x1\n"
+ "st1 { v10.b }[6], [x10], #0x1\n"
"st1 { v9.b }[6], [x9], #0x1\n"
- "st1 { v22.b }[6], [x28], #0x1\n"
- "st1 { v23.b }[6], [x27], #0x1\n"
+ "st1 { v21.b }[6], [x28], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[4], [x10], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[4], [x11], #0x1\n"
+ "st1 { v10.b }[4], [x10], #0x1\n"
"st1 { v9.b }[4], [x9], #0x1\n"
- "st1 { v22.b }[4], [x28], #0x1\n"
- "st1 { v23.b }[4], [x27], #0x1\n"
+ "st1 { v21.b }[4], [x28], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x8, #1, 86f\n"
- "st1 { v15.h }[0], [x10], #0x2\n"
+ "tbz x7, #1, 86f\n"
+ "st1 { v15.h }[0], [x11], #0x2\n"
+ "st1 { v10.h }[0], [x10], #0x2\n"
"st1 { v9.h }[0], [x9], #0x2\n"
- "st1 { v22.h }[0], [x28], #0x2\n"
- "st1 { v23.h }[0], [x27], #0x2\n"
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v21.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[2], [x11], #0x1\n"
+ "st1 { v10.b }[2], [x10], #0x1\n"
"st1 { v9.b }[2], [x9], #0x1\n"
- "st1 { v22.b }[2], [x28], #0x1\n"
- "st1 { v23.b }[2], [x27], #0x1\n"
+ "st1 { v21.b }[2], [x28], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x8, #0, 87f\n"
- "st1 { v15.b }[0], [x10], #0x1\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v15.b }[0], [x11], #0x1\n"
+ "st1 { v10.b }[0], [x10], #0x1\n"
"st1 { v9.b }[0], [x9], #0x1\n"
- "st1 { v22.b }[0], [x28], #0x1\n"
- "st1 { v23.b }[0], [x27], #0x1\n"
+ "st1 { v21.b }[0], [x28], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 082d8dd3e1..1ce037b68c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,2073 +111,2073 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x10, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
- "add x17, x10, %[offsetof_Requantize32_a_offset]\n"
- "add x9, x10, %[offsetof_Requantize32_b_offset]\n"
- "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x4, x10, %[offsetof_Requantize32_c_offset]\n"
- "add x14, x10, %[offsetof_Requantize32_minval]\n"
- "ldr x23, [%x[params], %[offsetof_Params_weights]]\n"
- "add x5, x10, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v9.16b }, [x17]\n"
- "ld1r { v14.16b }, [x9]\n"
- "lsr x3, x0, #0x3\n"
- "ld1r { v18.8h }, [x4]\n"
- "ld1r { v11.8h }, [x14]\n"
- "mov x24, #0x0\n"
- "mov x22, #0x0\n"
- "ld1r { v13.8h }, [x5]\n"
- "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x20, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x16, x8, [x25, #0x0]\n"
- "ldp x4, x7, [x25, #0x10]\n"
- "cbz x3, 3f\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "subs x3, x3, #0x1\n"
- "mov v17.16b, v15.16b\n"
- "ldr q16, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "ldr d2, [x23, #0x10]\n"
- "mov v8.16b, v16.16b\n"
- "mov v10.16b, v15.16b\n"
- "ldr d3, [x23, #0x18]\n"
- "ldr d4, [x23, #0x20]\n"
- "mov v7.16b, v16.16b\n"
- "mov v6.16b, v15.16b\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "mov v5.16b, v16.16b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ldr d31, [x28, x24]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr d30, [x6, x24]\n"
- "ldr d29, [x26, x24]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x3, x13, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v9.16b }, [x3]\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x11, x13, %[offsetof_Requantize32_b_offset]\n"
+ "add x5, x13, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x11]\n"
+ "ld1r { v14.8h }, [x5]\n"
+ "add x3, x13, %[offsetof_Requantize32_minval]\n"
+ "add x15, x13, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.8h }, [x3]\n"
+ "ld1r { v11.8h }, [x15]\n"
+ "mov x0, #0x0\n"
+ "mov x10, #0x0\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x6, [x24, #0x0]\n"
+ "ldp x7, x16, [x24, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q13, [x13, #0x0]\n"
+ "ldr q19, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d31, [x9, x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d30, [x28, x0]\n"
+ "ldr d29, [x27, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x25, x24]\n"
- "ldr d27, [x5, x24]\n"
+ "ldr d28, [x26, x0]\n"
+ "ldr d27, [x25, x0]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d23, [x2, x24]\n"
- "ldr d25, [x27, x24]\n"
+ "ldr d23, [x24, x0]\n"
+ "ldr d25, [x23, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d24, [x21, x24]\n"
- "ldr d26, [x12, x24]\n"
+ "ldr d24, [x22, x0]\n"
+ "ldr d26, [x21, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d22, [x19, x24]\n"
+ "ldr d22, [x20, x0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
"usubl v22.8h, v22.8b, v9.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr x19, [x20, #0x50]\n"
- "ldr d31, [x19, x24]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
- "ldr x15, [x20, #0x58]\n"
+ "ldr q18, [x5, #0x0]\n"
+ "ldr q6, [x8, #0x0]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr q5, [x5, #0x10]\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "ldr x19, [x20, #0x60]\n"
- "ldr x27, [x20, #0x68]\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x5, [x20, #0x70]\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x15, x24]\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x0]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "ldr d0, [x23, #0x28]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "ldr x12, [x20, #0x80]\n"
- "ldr x26, [x20, #0x88]\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x4, #0x78]\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x14, [x20, #0x90]\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "ldr x24, [x4, #0x88]\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x0]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
- "ldr d1, [x23, #0x30]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "ldr x21, [x20, #0xa0]\n"
- "ldr x2, [x20, #0xa8]\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x13, [x20, #0xb0]\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x27, x24]\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
- "ldr d2, [x23, #0x38]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "ldr x19, [x20, #0xc0]\n"
- "ldr x28, [x20, #0xc8]\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x6, [x20, #0xd0]\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x5, x24]\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x26, x0]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
- "ldr d3, [x23, #0x40]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "ldr d27, [x11, x24]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x25, x0]\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "ldr x11, [x20, #0xe0]\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0x48]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
- "ldr x5, [x20, #0xf0]\n"
- "ldr q12, [x10, #0x0]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "ldr q19, [x1, #0x0]\n"
- "ldr q20, [x10, #0x10]\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "ldr q29, [x1, #0x10]\n"
- "subs x3, x3, #0x1\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d28, [x26, x24]\n"
- "ldr d0, [x23, #0x50]\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x0]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "add x10, x10, #0x20\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "add x1, x1, #0x20\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "ldr d23, [x12, x24]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
- "ldr d1, [x23, #0x58]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d31, [x14, x24]\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "ldr d2, [x23, #0x60]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "ldr d30, [x15, x24]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
- "ldr d3, [x23, #0x68]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "ldr d26, [x21, x24]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x0]\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x0]\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "ldr x14, [x20, #0x110]\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x23, #0x70]\n"
- "ldr d22, [x9, x24]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "ldr d25, [x2, x24]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
- "ldr d0, [x23, #0x78]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "ldr d24, [x13, x24]\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
- "ldr d1, [x23, #0x80]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "ldr d2, [x23, #0x88]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "ldr d23, [x28, x24]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x22, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "ldr d3, [x23, #0x90]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "ldr d28, [x11, x24]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x9, x0]\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x0]\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr d31, [x6, x24]\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
- "ldr d4, [x23, #0x98]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x24]\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr d0, [x23, #0xa0]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d26, [x17, x24]\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr d1, [x23, #0xa8]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d25, [x5, x24]\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr d2, [x23, #0xb0]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "ldr d24, [x25, x24]\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr d3, [x23, #0xb8]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x0]\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "ldr d27, [x26, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0xc0]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "add x23, x23, #0xc8\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "ldr q22, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "add x3, x3, #0xc8\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x12, x24]\n"
+ "ldr d25, [x15, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x14, x24]\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ldr d24, [x21, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x24]\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ldr d27, [x20, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x24, x24, #0x8\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "add x0, x0, #0x8\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d13, [x17, x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x16, x22]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d17, [x8, x22]\n"
- "str d10, [x4, x22]\n"
- "str d6, [x7, x22]\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x19, #0x0]\n"
- "add x22, x22, #0x8\n"
- "ldr q16, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "ldr d2, [x23, #0x10]\n"
- "mov v17.16b, v15.16b\n"
- "mov v8.16b, v16.16b\n"
- "ldr d3, [x23, #0x18]\n"
- "ldr d4, [x23, #0x20]\n"
- "mov v10.16b, v15.16b\n"
- "mov v7.16b, v16.16b\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "mov v6.16b, v15.16b\n"
- "mov v5.16b, v16.16b\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ldr d31, [x28, x24]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr d30, [x6, x24]\n"
- "ldr d29, [x26, x24]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
+ "str d20, [x6, x10]\n"
+ "str d8, [x7, x10]\n"
+ "str d17, [x16, x10]\n"
+ "ldr q13, [x13, #0x0]\n"
+ "ldr q19, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "add x10, x10, #0x8\n"
+ "str x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d31, [x9, x0]\n"
+ "ldr d30, [x28, x0]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d29, [x27, x0]\n"
+ "ldr d28, [x26, x0]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d28, [x25, x24]\n"
- "ldr d27, [x5, x24]\n"
+ "ldr d27, [x25, x0]\n"
+ "ldr d23, [x24, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
"usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d23, [x2, x24]\n"
- "ldr d25, [x27, x24]\n"
+ "ldr d25, [x23, x0]\n"
+ "ldr d24, [x22, x0]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d24, [x21, x24]\n"
- "ldr d26, [x12, x24]\n"
+ "ldr d26, [x21, x0]\n"
+ "ldr d22, [x20, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr d22, [x19, x24]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
"usubl v26.8h, v26.8b, v9.8b\n"
"usubl v22.8h, v22.8b, v9.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr x19, [x20, #0x50]\n"
- "ldr d31, [x19, x24]\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
- "ldr x15, [x20, #0x58]\n"
+ "ldr q18, [x5, #0x0]\n"
+ "ldr q6, [x8, #0x0]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr q5, [x5, #0x10]\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "ldr x19, [x20, #0x60]\n"
- "ldr x27, [x20, #0x68]\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x5, [x20, #0x70]\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d30, [x15, x24]\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x22, x0]\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
- "ldr d0, [x23, #0x28]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "ldr x12, [x20, #0x80]\n"
- "ldr x26, [x20, #0x88]\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "ldr x25, [x4, #0x78]\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x14, [x20, #0x90]\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "ldr x24, [x4, #0x88]\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x21, x0]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
- "ldr d1, [x23, #0x30]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "ldr x21, [x20, #0xa0]\n"
- "ldr x2, [x20, #0xa8]\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x13, [x20, #0xb0]\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d25, [x27, x24]\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x20, x0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
- "ldr d2, [x23, #0x38]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "ldr x19, [x20, #0xc0]\n"
- "ldr x28, [x20, #0xc8]\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x6, [x20, #0xd0]\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d24, [x5, x24]\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x26, x0]\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
- "ldr d3, [x23, #0x40]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "ldr d27, [x11, x24]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x25, x0]\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "ldr x11, [x20, #0xe0]\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0x48]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
- "ldr x5, [x20, #0xf0]\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "ldr q12, [x10, #0x0]\n"
- "ldr q19, [x1, #0x0]\n"
- "smlal2 v7.4s, v22.8h, v0.8h\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "ldr q20, [x10, #0x10]\n"
- "ldr q29, [x1, #0x10]\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d28, [x26, x24]\n"
- "ldr d0, [x23, #0x50]\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x24, x0]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "tst x0, #0x7\n"
- "smlal2 v7.4s, v25.8h, v1.8h\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "add x10, x10, #0x20\n"
- "add x1, x1, #0x20\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "ldr d23, [x12, x24]\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "tst x1, #0x7\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "ldr d23, [x23, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
- "ldr d1, [x23, #0x58]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal2 v7.4s, v24.8h, v2.8h\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d31, [x14, x24]\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x15, x0]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
- "ldr d2, [x23, #0x60]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "ldr x14, [x20, #0x110]\n"
- "smlal2 v7.4s, v27.8h, v3.8h\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "ldr d30, [x15, x24]\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr d30, [x21, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
- "ldr d3, [x23, #0x68]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "ldr d26, [x21, x24]\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x20, x0]\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x0]\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x23, #0x70]\n"
- "ldr d22, [x9, x24]\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v7.4s, v31.8h, v0.8h\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "ldr d25, [x2, x24]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x13, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
- "ldr d0, [x23, #0x78]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal2 v7.4s, v30.8h, v1.8h\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "ldr d24, [x13, x24]\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x12, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
- "ldr d1, [x23, #0x80]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal2 v7.4s, v26.8h, v2.8h\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d27, [x19, x24]\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x11, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "ldr d2, [x23, #0x88]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "ldr d23, [x28, x24]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "ldr d23, [x22, x0]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "ldr d3, [x23, #0x90]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "ldr d28, [x11, x24]\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x9, x0]\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x27, x0]\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "ldr d31, [x6, x24]\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
- "ldr d4, [x23, #0x98]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal2 v7.4s, v27.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "ldr d30, [x27, x24]\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x28, x0]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr d0, [x23, #0xa0]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal2 v7.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d26, [x17, x24]\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "ldr d26, [x26, x0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr d1, [x23, #0xa8]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal2 v7.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d25, [x5, x24]\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x25, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr d2, [x23, #0xb0]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal2 v7.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "ldr d24, [x25, x24]\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr d24, [x24, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr d3, [x23, #0xb8]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x23, x0]\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "ldr d27, [x26, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x23, #0xc0]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "ldr q22, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "ldr d25, [x12, x24]\n"
+ "ldr d25, [x15, x0]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "ldr d24, [x14, x24]\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ldr d24, [x21, x0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x24]\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ldr d27, [x20, x0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x24, x24, #0x8\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "add x0, x0, #0x8\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d13, [x17, x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d15, [x16, x22]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d17, [x8, x22]\n"
- "str d10, [x4, x22]\n"
- "str d6, [x7, x22]\n"
- "add x22, x22, #0x8\n"
+ "str d20, [x6, x10]\n"
+ "str d8, [x7, x10]\n"
+ "str d17, [x16, x10]\n"
+ "add x10, x10, #0x8\n"
"beq 124f\n"
- "add x23, x23, #0xc8\n"
+ "add x3, x3, #0xc8\n"
"3:" // Oddments
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x0, #2, 5f\n"
- "ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x0, #1, 4f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
- "tbz x0, #0, 7f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v13.4s }, [x13], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v19.s }[2], [x13]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x0, #0, 7f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v19.s }[0], [x13]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x0, #1, 6f\n"
- "ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x0, #0, 7f\n"
- "ld1 { v15.s }[2], [x19]\n"
+ "tbz x1, #1, 6f\n"
+ "ld1 { v13.d }[0], [x13], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v13.s }[2], [x13]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 7f\n"
- "ld1 { v15.s }[0], [x19]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v13.s }[0], [x13]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d0, [x23, #0x0]\n"
- "ldr d1, [x23, #0x8]\n"
- "mov v17.16b, v15.16b\n"
- "mov v8.16b, v16.16b\n"
- "ldr d2, [x23, #0x10]\n"
- "ldr d3, [x23, #0x18]\n"
- "mov v10.16b, v15.16b\n"
- "mov v7.16b, v16.16b\n"
- "ldr d4, [x23, #0x20]\n"
- "ldp x28, x6, [x20, #0x0]\n"
- "mov v6.16b, v15.16b\n"
- "mov v5.16b, v16.16b\n"
- "ldp x26, x25, [x20, #0x10]\n"
- "ldp x5, x2, [x20, #0x20]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldp x27, x21, [x20, #0x30]\n"
- "ldp x12, x19, [x20, #0x40]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "add x28, x28, x24\n"
- "add x6, x6, x24\n"
- "add x26, x26, x24\n"
- "add x25, x25, x24\n"
- "add x5, x5, x24\n"
- "add x2, x2, x24\n"
- "add x27, x27, x24\n"
- "add x21, x21, x24\n"
- "add x12, x12, x24\n"
- "add x19, x19, x24\n"
- "tbz x0, #2, 9f\n"
- "ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x6], #0x4\n"
- "ld1 { v29.s }[0], [x26], #0x4\n"
- "ld1 { v28.s }[0], [x25], #0x4\n"
- "ld1 { v27.s }[0], [x5], #0x4\n"
- "ld1 { v23.s }[0], [x2], #0x4\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "ld1 { v24.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x12], #0x4\n"
- "ld1 { v22.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 8f\n"
- "ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x6], #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x5], #0x2\n"
- "ld1 { v23.h }[2], [x2], #0x2\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "ld1 { v24.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x12], #0x2\n"
- "ld1 { v22.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x6]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x5]\n"
- "ld1 { v23.b }[6], [x2]\n"
- "ld1 { v25.b }[6], [x27]\n"
- "ld1 { v24.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x12]\n"
- "ld1 { v22.b }[6], [x19]\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v20.16b, v13.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "mov v21.16b, v19.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x9, x9, x0\n"
+ "add x28, x28, x0\n"
+ "add x27, x27, x0\n"
+ "add x26, x26, x0\n"
+ "add x25, x25, x0\n"
+ "add x24, x24, x0\n"
+ "add x23, x23, x0\n"
+ "add x22, x22, x0\n"
+ "add x21, x21, x0\n"
+ "add x20, x20, x0\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "ld1 { v29.s }[0], [x27], #0x4\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x6]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x5]\n"
- "ld1 { v23.b }[4], [x2]\n"
- "ld1 { v25.b }[4], [x27]\n"
- "ld1 { v24.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x12]\n"
- "ld1 { v22.b }[4], [x19]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x0, #1, 10f\n"
- "ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x6], #0x2\n"
- "ld1 { v29.h }[0], [x26], #0x2\n"
- "ld1 { v28.h }[0], [x25], #0x2\n"
- "ld1 { v27.h }[0], [x5], #0x2\n"
- "ld1 { v23.h }[0], [x2], #0x2\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "ld1 { v24.h }[0], [x21], #0x2\n"
- "ld1 { v26.h }[0], [x12], #0x2\n"
- "ld1 { v22.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x6]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x5]\n"
- "ld1 { v23.b }[2], [x2]\n"
- "ld1 { v25.b }[2], [x27]\n"
- "ld1 { v24.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x12]\n"
- "ld1 { v22.b }[2], [x19]\n"
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "ld1 { v29.h }[0], [x27], #0x2\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 11f\n"
- "ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x6]\n"
- "ld1 { v29.b }[0], [x26]\n"
- "ld1 { v28.b }[0], [x25]\n"
- "ld1 { v27.b }[0], [x5]\n"
- "ld1 { v23.b }[0], [x2]\n"
- "ld1 { v25.b }[0], [x27]\n"
- "ld1 { v24.b }[0], [x21]\n"
- "ld1 { v26.b }[0], [x12]\n"
- "ld1 { v22.b }[0], [x19]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "ld1 { v29.b }[0], [x27]\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "ld1 { v27.b }[0], [x25]\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v22.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x19, [x20, #0x50]\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x4, #0x50]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "smlal v10.4s, v29.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v8.4s, v29.4h, v0.4h\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "add x19, x19, x24\n"
+ "add x20, x20, x0\n"
"smlal2 v7.4s, v29.8h, v0.8h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v6.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v27.4h, v1.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "smlal2 v8.4s, v27.8h, v1.8h\n"
- "smlal v10.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "smlal v8.4s, v28.4h, v1.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
"smlal2 v7.4s, v28.8h, v1.8h\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v15.4s, v27.4h, v2.4h\n"
- "smlal2 v16.4s, v27.8h, v2.8h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal v10.4s, v23.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "smlal2 v19.4s, v27.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
"smlal2 v7.4s, v23.8h, v2.8h\n"
- "tbz x0, #2, 13f\n"
- "ld1 { v31.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 12f\n"
- "ld1 { v31.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[6], [x19]\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[4], [x19]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x0, #1, 14f\n"
- "ld1 { v31.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[2], [x19]\n"
+ "tbz x1, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 15f\n"
- "ld1 { v31.b }[0], [x19]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x15, [x20, #0x58]\n"
- "smlal v6.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "add x15, x15, x24\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr x22, [x4, #0x58]\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "add x22, x22, x0\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v31.4h, v3.4h\n"
"smlal2 v7.4s, v31.8h, v3.8h\n"
- "tbz x0, #2, 17f\n"
- "ld1 { v30.s }[0], [x15], #0x4\n"
- "tbz x0, #1, 16f\n"
- "ld1 { v30.h }[2], [x15], #0x2\n"
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[6], [x15]\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[6], [x22]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[4], [x15]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[4], [x22]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x0, #1, 18f\n"
- "ld1 { v30.h }[0], [x15], #0x2\n"
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[2], [x15]\n"
+ "tbz x1, #1, 18f\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[2], [x22]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 19f\n"
- "ld1 { v30.b }[0], [x15]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v30.b }[0], [x22]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x19, [x20, #0x60]\n"
- "smlal v6.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "add x19, x19, x24\n"
- "tbz x0, #2, 21f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 20f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "add x21, x21, x0\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x0, #1, 22f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x1, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 23f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d0, [x3, #0x28]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d0, [x23, #0x28]\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v27.8h, v4.8h\n"
- "smlal v10.4s, v30.4h, v4.4h\n"
+ "smlal v20.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal v8.4s, v30.4h, v4.4h\n"
"smlal2 v7.4s, v30.8h, v4.8h\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x27, [x20, #0x68]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x27, x27, x24\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v16.4s, v29.8h, v0.8h\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v8.4s, v28.8h, v0.8h\n"
- "smlal v10.4s, v22.4h, v0.4h\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v29.4h, v0.4h\n"
+ "smlal2 v19.4s, v29.8h, v0.8h\n"
+ "smlal v20.4s, v28.4h, v0.4h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "smlal v8.4s, v22.4h, v0.4h\n"
"smlal2 v7.4s, v22.8h, v0.8h\n"
- "tbz x0, #2, 25f\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "tbz x0, #1, 24f\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[6], [x27]\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[4], [x27]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x0, #1, 26f\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[2], [x27]\n"
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 27f\n"
- "ld1 { v25.b }[0], [x27]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x23, #0x30]\n"
+ "ldr d1, [x3, #0x30]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x5, [x20, #0x70]\n"
- "smlal v6.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "add x5, x5, x24\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v16.4s, v28.8h, v1.8h\n"
- "smlal v17.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v25.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "add x26, x26, x0\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v20.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "smlal v8.4s, v25.4h, v1.4h\n"
"smlal2 v7.4s, v25.8h, v1.8h\n"
- "tbz x0, #2, 29f\n"
- "ld1 { v24.s }[0], [x5], #0x4\n"
- "tbz x0, #1, 28f\n"
- "ld1 { v24.h }[2], [x5], #0x2\n"
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[6], [x5]\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v24.s }[0], [x26], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[6], [x26]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[4], [x5]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[4], [x26]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x0, #1, 30f\n"
- "ld1 { v24.h }[0], [x5], #0x2\n"
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[2], [x5]\n"
+ "tbz x1, #1, 30f\n"
+ "ld1 { v24.h }[0], [x26], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[2], [x26]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 31f\n"
- "ld1 { v24.b }[0], [x5]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v24.b }[0], [x26]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x23, #0x38]\n"
+ "ldr d2, [x3, #0x38]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x11, [x20, #0x78]\n"
- "smlal v6.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "add x11, x11, x24\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v16.4s, v23.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v24.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x25, [x4, #0x78]\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "add x25, x25, x0\n"
+ "smlal v13.4s, v23.4h, v2.4h\n"
+ "smlal2 v19.4s, v23.8h, v2.8h\n"
+ "smlal v20.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
"smlal2 v7.4s, v24.8h, v2.8h\n"
- "tbz x0, #2, 33f\n"
- "ld1 { v27.s }[0], [x11], #0x4\n"
- "tbz x0, #1, 32f\n"
- "ld1 { v27.h }[2], [x11], #0x2\n"
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[6], [x11]\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v27.s }[0], [x25], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[6], [x25]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[4], [x11]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[4], [x25]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x0, #1, 34f\n"
- "ld1 { v27.h }[0], [x11], #0x2\n"
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[2], [x11]\n"
+ "tbz x1, #1, 34f\n"
+ "ld1 { v27.h }[0], [x25], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[2], [x25]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 35f\n"
- "ld1 { v27.b }[0], [x11]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v27.b }[0], [x25]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x23, #0x40]\n"
+ "ldr d3, [x3, #0x40]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x12, [x20, #0x80]\n"
- "smlal v6.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "add x12, x12, x24\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v16.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v27.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x23, [x4, #0x80]\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "add x23, x23, x0\n"
+ "smlal v13.4s, v31.4h, v3.4h\n"
+ "smlal2 v19.4s, v31.8h, v3.8h\n"
+ "smlal v20.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "smlal v8.4s, v27.4h, v3.4h\n"
"smlal2 v7.4s, v27.8h, v3.8h\n"
- "tbz x0, #2, 37f\n"
- "ld1 { v23.s }[0], [x12], #0x4\n"
- "tbz x0, #1, 36f\n"
- "ld1 { v23.h }[2], [x12], #0x2\n"
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[6], [x12]\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[6], [x23]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[4], [x12]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[4], [x23]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x0, #1, 38f\n"
- "ld1 { v23.h }[0], [x12], #0x2\n"
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[2], [x12]\n"
+ "tbz x1, #1, 38f\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[2], [x23]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 39f\n"
- "ld1 { v23.b }[0], [x12]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v23.b }[0], [x23]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x23, #0x48]\n"
+ "ldr d4, [x3, #0x48]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x26, [x20, #0x88]\n"
- "smlal v6.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "add x26, x26, x24\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v16.4s, v30.8h, v4.8h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "smlal v10.4s, v23.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x24, [x4, #0x88]\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "add x24, x24, x0\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v19.4s, v30.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
"smlal2 v7.4s, v23.8h, v4.8h\n"
- "tbz x0, #2, 41f\n"
- "ld1 { v28.s }[0], [x26], #0x4\n"
- "tbz x0, #1, 40f\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[6], [x26]\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[6], [x24]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[4], [x26]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[4], [x24]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x0, #1, 42f\n"
- "ld1 { v28.h }[0], [x26], #0x2\n"
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[2], [x26]\n"
+ "tbz x1, #1, 42f\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[2], [x24]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 43f\n"
- "ld1 { v28.b }[0], [x26]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v28.b }[0], [x24]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x23, #0x50]\n"
+ "ldr d0, [x3, #0x50]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x14, [x20, #0x90]\n"
- "smlal v6.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "add x14, x14, x24\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v16.4s, v22.8h, v0.8h\n"
- "smlal v17.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "tbz x0, #2, 45f\n"
- "ld1 { v31.s }[0], [x14], #0x4\n"
- "tbz x0, #1, 44f\n"
- "ld1 { v31.h }[2], [x14], #0x2\n"
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[6], [x14]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x15, [x4, #0x90]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "add x15, x15, x0\n"
+ "smlal v13.4s, v22.4h, v0.4h\n"
+ "smlal2 v19.4s, v22.8h, v0.8h\n"
+ "smlal v20.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x15], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x15], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x15]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[4], [x14]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x15]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x0, #1, 46f\n"
- "ld1 { v31.h }[0], [x14], #0x2\n"
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[2], [x14]\n"
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x15], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x15]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 47f\n"
- "ld1 { v31.b }[0], [x14]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x15]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x15, [x20, #0x98]\n"
- "smlal v10.4s, v31.4h, v0.4h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "smlal v8.4s, v31.4h, v0.4h\n"
"smlal2 v7.4s, v31.8h, v0.8h\n"
- "add x15, x15, x24\n"
- "tbz x0, #2, 49f\n"
- "ld1 { v30.s }[0], [x15], #0x4\n"
- "tbz x0, #1, 48f\n"
- "ld1 { v30.h }[2], [x15], #0x2\n"
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[6], [x15]\n"
+ "add x21, x21, x0\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[6], [x21]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[4], [x15]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[4], [x21]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x0, #1, 50f\n"
- "ld1 { v30.h }[0], [x15], #0x2\n"
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[2], [x15]\n"
+ "tbz x1, #1, 50f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[2], [x21]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 51f\n"
- "ld1 { v30.b }[0], [x15]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v30.b }[0], [x21]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x23, #0x58]\n"
+ "ldr d1, [x3, #0x58]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x21, [x20, #0xa0]\n"
- "smlal v6.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "add x21, x21, x24\n"
- "smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v16.4s, v25.8h, v1.8h\n"
- "smlal v17.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "smlal v10.4s, v30.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "add x14, x14, x0\n"
+ "smlal v13.4s, v25.4h, v1.4h\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "smlal v20.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "smlal v8.4s, v30.4h, v1.4h\n"
"smlal2 v7.4s, v30.8h, v1.8h\n"
- "tbz x0, #2, 53f\n"
- "ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x0, #1, 52f\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[6], [x21]\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[6], [x14]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[4], [x21]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[4], [x14]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x0, #1, 54f\n"
- "ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[2], [x21]\n"
+ "tbz x1, #1, 54f\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[2], [x14]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 55f\n"
- "ld1 { v26.b }[0], [x21]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v26.b }[0], [x14]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x23, #0x60]\n"
+ "ldr d2, [x3, #0x60]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x2, [x20, #0xa8]\n"
- "smlal v6.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "add x2, x2, x24\n"
- "smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v16.4s, v24.8h, v2.8h\n"
- "smlal v17.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "smlal v10.4s, v26.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "add x13, x13, x0\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v19.4s, v24.8h, v2.8h\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
"smlal2 v7.4s, v26.8h, v2.8h\n"
- "tbz x0, #2, 57f\n"
- "ld1 { v25.s }[0], [x2], #0x4\n"
- "tbz x0, #1, 56f\n"
- "ld1 { v25.h }[2], [x2], #0x2\n"
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[6], [x2]\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v25.s }[0], [x13], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v25.h }[2], [x13], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[6], [x13]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[4], [x2]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[4], [x13]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x0, #1, 58f\n"
- "ld1 { v25.h }[0], [x2], #0x2\n"
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[2], [x2]\n"
+ "tbz x1, #1, 58f\n"
+ "ld1 { v25.h }[0], [x13], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[2], [x13]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 59f\n"
- "ld1 { v25.b }[0], [x2]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v25.b }[0], [x13]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x23, #0x68]\n"
+ "ldr d3, [x3, #0x68]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x13, [x20, #0xb0]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x13, x13, x24\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v16.4s, v27.8h, v3.8h\n"
- "smlal v17.4s, v23.4h, v3.4h\n"
- "smlal2 v8.4s, v23.8h, v3.8h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "add x12, x12, x0\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v19.4s, v27.8h, v3.8h\n"
+ "smlal v20.4s, v23.4h, v3.4h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x0, #2, 61f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "tbz x0, #1, 60f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[6], [x13]\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v24.s }[0], [x12], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v24.h }[2], [x12], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[6], [x12]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[4], [x13]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[4], [x12]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x0, #1, 62f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[2], [x13]\n"
+ "tbz x1, #1, 62f\n"
+ "ld1 { v24.h }[0], [x12], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[2], [x12]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 63f\n"
- "ld1 { v24.b }[0], [x13]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v24.b }[0], [x12]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x23, #0x70]\n"
+ "ldr d4, [x3, #0x70]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x9, [x20, #0xb8]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x9, x9, x24\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v16.4s, v23.8h, v4.8h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v23.4h, v4.4h\n"
+ "smlal2 v19.4s, v23.8h, v4.8h\n"
+ "smlal v20.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x0, #2, 65f\n"
- "ld1 { v22.s }[0], [x9], #0x4\n"
- "tbz x0, #1, 64f\n"
- "ld1 { v22.h }[2], [x9], #0x2\n"
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[6], [x9]\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[4], [x9]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x0, #1, 66f\n"
- "ld1 { v22.h }[0], [x9], #0x2\n"
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[2], [x9]\n"
+ "tbz x1, #1, 66f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 67f\n"
- "ld1 { v22.b }[0], [x9]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x23, #0x78]\n"
+ "ldr d0, [x3, #0x78]\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x19, [x20, #0xc0]\n"
- "smlal v6.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "add x19, x19, x24\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v16.4s, v31.8h, v0.8h\n"
- "smlal v17.4s, v30.4h, v0.4h\n"
- "smlal2 v8.4s, v30.8h, v0.8h\n"
- "tbz x0, #2, 69f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
- "tbz x0, #1, 68f\n"
- "ld1 { v27.h }[2], [x19], #0x2\n"
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[6], [x19]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "add x11, x11, x0\n"
+ "smlal v13.4s, v31.4h, v0.4h\n"
+ "smlal2 v19.4s, v31.8h, v0.8h\n"
+ "smlal v20.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[4], [x19]\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x0, #1, 70f\n"
- "ld1 { v27.h }[0], [x19], #0x2\n"
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[2], [x19]\n"
+ "tbz x1, #1, 70f\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 71f\n"
- "ld1 { v27.b }[0], [x19]\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v27.b }[0], [x11]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x28, [x20, #0xc8]\n"
- "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr x22, [x4, #0xc8]\n"
+ "smlal v8.4s, v27.4h, v0.4h\n"
"smlal2 v7.4s, v27.8h, v0.8h\n"
- "add x28, x28, x24\n"
- "tbz x0, #2, 73f\n"
- "ld1 { v23.s }[0], [x28], #0x4\n"
- "tbz x0, #1, 72f\n"
- "ld1 { v23.h }[2], [x28], #0x2\n"
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[6], [x28]\n"
+ "add x22, x22, x0\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v23.s }[0], [x22], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[6], [x22]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[4], [x28]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[4], [x22]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x0, #1, 74f\n"
- "ld1 { v23.h }[0], [x28], #0x2\n"
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[2], [x28]\n"
+ "tbz x1, #1, 74f\n"
+ "ld1 { v23.h }[0], [x22], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[2], [x22]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 75f\n"
- "ld1 { v23.b }[0], [x28]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v23.b }[0], [x22]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x23, #0x80]\n"
+ "ldr d1, [x3, #0x80]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x6, [x20, #0xd0]\n"
- "smlal v6.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "add x6, x6, x24\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v16.4s, v30.8h, v1.8h\n"
- "smlal v17.4s, v26.4h, v1.4h\n"
- "smlal2 v8.4s, v26.8h, v1.8h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "add x9, x9, x0\n"
+ "smlal v13.4s, v30.4h, v1.4h\n"
+ "smlal2 v19.4s, v30.8h, v1.8h\n"
+ "smlal v20.4s, v26.4h, v1.4h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
"smlal2 v7.4s, v23.8h, v1.8h\n"
- "tbz x0, #2, 77f\n"
- "ld1 { v31.s }[0], [x6], #0x4\n"
- "tbz x0, #1, 76f\n"
- "ld1 { v31.h }[2], [x6], #0x2\n"
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[6], [x6]\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[6], [x9]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[4], [x6]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[4], [x9]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x0, #1, 78f\n"
- "ld1 { v31.h }[0], [x6], #0x2\n"
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[2], [x6]\n"
+ "tbz x1, #1, 78f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[2], [x9]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 79f\n"
- "ld1 { v31.b }[0], [x6]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v31.b }[0], [x9]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x23, #0x88]\n"
+ "ldr d2, [x3, #0x88]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x27, [x20, #0xd8]\n"
- "smlal v6.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "add x27, x27, x24\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v16.4s, v26.8h, v2.8h\n"
- "smlal v17.4s, v25.4h, v2.4h\n"
- "smlal2 v8.4s, v25.8h, v2.8h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "add x28, x28, x0\n"
+ "smlal v13.4s, v26.4h, v2.4h\n"
+ "smlal2 v19.4s, v26.8h, v2.8h\n"
+ "smlal v20.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
"smlal2 v7.4s, v31.8h, v2.8h\n"
- "tbz x0, #2, 81f\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "tbz x0, #1, 80f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[6], [x28]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[4], [x28]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x0, #1, 82f\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "tbz x1, #1, 82f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[2], [x28]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 83f\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v30.b }[0], [x28]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x23, #0x90]\n"
+ "ldr d3, [x3, #0x90]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x11, [x20, #0xe0]\n"
- "smlal v6.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "add x11, x11, x24\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v16.4s, v25.8h, v3.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal2 v8.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "add x27, x27, x0\n"
+ "smlal v13.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "smlal v20.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
"smlal2 v7.4s, v30.8h, v3.8h\n"
- "tbz x0, #2, 85f\n"
- "ld1 { v28.s }[0], [x11], #0x4\n"
- "tbz x0, #1, 84f\n"
- "ld1 { v28.h }[2], [x11], #0x2\n"
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[6], [x11]\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[6], [x27]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[4], [x11]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[4], [x27]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x0, #1, 86f\n"
- "ld1 { v28.h }[0], [x11], #0x2\n"
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[2], [x11]\n"
+ "tbz x1, #1, 86f\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[2], [x27]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 87f\n"
- "ld1 { v28.b }[0], [x11]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v28.b }[0], [x27]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x23, #0x98]\n"
+ "ldr d4, [x3, #0x98]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x17, [x20, #0xe8]\n"
- "smlal v6.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "add x17, x17, x24\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v16.4s, v24.8h, v4.8h\n"
- "smlal v17.4s, v22.4h, v4.4h\n"
- "smlal2 v8.4s, v22.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "add x26, x26, x0\n"
+ "smlal v13.4s, v24.4h, v4.4h\n"
+ "smlal2 v19.4s, v24.8h, v4.8h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
"smlal2 v7.4s, v28.8h, v4.8h\n"
- "tbz x0, #2, 89f\n"
- "ld1 { v26.s }[0], [x17], #0x4\n"
- "tbz x0, #1, 88f\n"
- "ld1 { v26.h }[2], [x17], #0x2\n"
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[6], [x17]\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[6], [x26]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[4], [x17]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[4], [x26]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x0, #1, 90f\n"
- "ld1 { v26.h }[0], [x17], #0x2\n"
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[2], [x17]\n"
+ "tbz x1, #1, 90f\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[2], [x26]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 91f\n"
- "ld1 { v26.b }[0], [x17]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v26.b }[0], [x26]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x23, #0xa0]\n"
+ "ldr d0, [x3, #0xa0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "ldr x5, [x20, #0xf0]\n"
- "smlal v6.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "add x5, x5, x24\n"
- "smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v16.4s, v27.8h, v0.8h\n"
- "smlal v17.4s, v23.4h, v0.4h\n"
- "smlal2 v8.4s, v23.8h, v0.8h\n"
- "tbz x0, #2, 93f\n"
- "ld1 { v25.s }[0], [x5], #0x4\n"
- "tbz x0, #1, 92f\n"
- "ld1 { v25.h }[2], [x5], #0x2\n"
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[6], [x5]\n"
+ "ssubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "add x25, x25, x0\n"
+ "smlal v13.4s, v27.4h, v0.4h\n"
+ "smlal2 v19.4s, v27.8h, v0.8h\n"
+ "smlal v20.4s, v23.4h, v0.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[6], [x25]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[4], [x5]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[4], [x25]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x0, #1, 94f\n"
- "ld1 { v25.h }[0], [x5], #0x2\n"
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[2], [x5]\n"
+ "tbz x1, #1, 94f\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[2], [x25]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 95f\n"
- "ld1 { v25.b }[0], [x5]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v25.b }[0], [x25]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x25, [x20, #0xf8]\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
"smlal2 v7.4s, v25.8h, v0.8h\n"
- "add x25, x25, x24\n"
- "tbz x0, #2, 97f\n"
- "ld1 { v24.s }[0], [x25], #0x4\n"
- "tbz x0, #1, 96f\n"
- "ld1 { v24.h }[2], [x25], #0x2\n"
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[6], [x25]\n"
+ "add x24, x24, x0\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[6], [x24]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[4], [x25]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[4], [x24]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x0, #1, 98f\n"
- "ld1 { v24.h }[0], [x25], #0x2\n"
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "tbz x1, #1, 98f\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[2], [x24]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 99f\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v24.b }[0], [x24]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x23, #0xa8]\n"
+ "ldr d1, [x3, #0xa8]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "ldr x26, [x20, #0x100]\n"
- "smlal v6.4s, v24.4h, v0.4h\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
- "add x26, x26, x24\n"
- "smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v16.4s, v23.8h, v1.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v8.4s, v31.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
+ "ssubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal v17.4s, v24.4h, v0.4h\n"
+ "smlal2 v21.4s, v24.8h, v0.8h\n"
+ "add x23, x23, x0\n"
+ "smlal v13.4s, v23.4h, v1.4h\n"
+ "smlal2 v19.4s, v23.8h, v1.8h\n"
+ "smlal v20.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
"smlal2 v7.4s, v24.8h, v1.8h\n"
- "tbz x0, #2, 101f\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "tbz x0, #1, 100f\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[6], [x26]\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[6], [x23]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[4], [x26]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[4], [x23]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x0, #1, 102f\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[2], [x26]\n"
+ "tbz x1, #1, 102f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[2], [x23]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 103f\n"
- "ld1 { v27.b }[0], [x26]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v27.b }[0], [x23]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x23, #0xb0]\n"
+ "ldr d2, [x3, #0xb0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "ldr x12, [x20, #0x108]\n"
- "smlal v6.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "add x12, x12, x24\n"
- "smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v16.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v30.4h, v2.4h\n"
- "smlal2 v8.4s, v30.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x15, [x4, #0x108]\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "add x15, x15, x0\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
"smlal2 v7.4s, v27.8h, v2.8h\n"
- "tbz x0, #2, 105f\n"
- "ld1 { v25.s }[0], [x12], #0x4\n"
- "tbz x0, #1, 104f\n"
- "ld1 { v25.h }[2], [x12], #0x2\n"
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[6], [x12]\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v25.s }[0], [x15], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v25.h }[2], [x15], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[6], [x15]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[4], [x12]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[4], [x15]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x0, #1, 106f\n"
- "ld1 { v25.h }[0], [x12], #0x2\n"
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[2], [x12]\n"
+ "tbz x1, #1, 106f\n"
+ "ld1 { v25.h }[0], [x15], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[2], [x15]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 107f\n"
- "ld1 { v25.b }[0], [x12]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v25.b }[0], [x15]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x23, #0xb8]\n"
+ "ldr d3, [x3, #0xb8]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "ldr x14, [x20, #0x110]\n"
- "smlal v6.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "add x14, x14, x24\n"
- "smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v16.4s, v30.8h, v3.8h\n"
- "smlal v17.4s, v28.4h, v3.4h\n"
- "smlal2 v8.4s, v28.8h, v3.8h\n"
- "smlal v10.4s, v25.4h, v3.4h\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "add x21, x21, x0\n"
+ "smlal v13.4s, v30.4h, v3.4h\n"
+ "smlal2 v19.4s, v30.8h, v3.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "smlal v8.4s, v25.4h, v3.4h\n"
"smlal2 v7.4s, v25.8h, v3.8h\n"
- "tbz x0, #2, 109f\n"
- "ld1 { v24.s }[0], [x14], #0x4\n"
- "tbz x0, #1, 108f\n"
- "ld1 { v24.h }[2], [x14], #0x2\n"
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[6], [x14]\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[6], [x21]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[4], [x14]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[4], [x21]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x0, #1, 110f\n"
- "ld1 { v24.h }[0], [x14], #0x2\n"
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[2], [x14]\n"
+ "tbz x1, #1, 110f\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[2], [x21]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 111f\n"
- "ld1 { v24.b }[0], [x14]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v24.b }[0], [x21]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x23, #0xc0]\n"
+ "ldr d4, [x3, #0xc0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x21, [x20, #0x118]\n"
- "smlal v6.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "add x21, x21, x24\n"
- "smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v16.4s, v28.8h, v4.8h\n"
- "smlal v17.4s, v26.4h, v4.4h\n"
- "smlal2 v8.4s, v26.8h, v4.8h\n"
- "smlal v10.4s, v24.4h, v4.4h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "add x20, x20, x0\n"
+ "smlal v13.4s, v28.4h, v4.4h\n"
+ "smlal2 v19.4s, v28.8h, v4.8h\n"
+ "smlal v20.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v8.4s, v24.4h, v4.4h\n"
"smlal2 v7.4s, v24.8h, v4.8h\n"
- "tbz x0, #2, 113f\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x0, #1, 112f\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[6], [x21]\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[4], [x21]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x0, #1, 114f\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[2], [x21]\n"
+ "tbz x1, #1, 114f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 115f\n"
- "ld1 { v27.b }[0], [x21]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v6.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x0, #2, 117f\n"
- "ld1 { v12.4s }, [x10], #0x10\n"
- "ld1 { v19.4s }, [x1], #0x10\n"
- "tbz x0, #1, 116f\n"
- "ld1 { v20.d }[0], [x10], #0x8\n"
- "ld1 { v29.d }[0], [x1], #0x8\n"
- "tbz x0, #0, 119f\n"
- "ld1 { v20.s }[2], [x10]\n"
- "ld1 { v29.s }[2], [x1]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v18.4s }, [x5], #0x10\n"
+ "ld1 { v6.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v5.d }[0], [x5], #0x8\n"
+ "ld1 { v22.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v5.s }[2], [x5]\n"
+ "ld1 { v22.s }[2], [x8]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x0, #0, 119f\n"
- "ld1 { v20.s }[0], [x10]\n"
- "ld1 { v29.s }[0], [x1]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v5.s }[0], [x5]\n"
+ "ld1 { v22.s }[0], [x8]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x0, #1, 118f\n"
- "ld1 { v12.d }[0], [x10], #0x8\n"
- "ld1 { v19.d }[0], [x1], #0x8\n"
- "tbz x0, #0, 119f\n"
- "ld1 { v12.s }[2], [x10]\n"
- "ld1 { v19.s }[2], [x1]\n"
+ "tbz x1, #1, 118f\n"
+ "ld1 { v18.d }[0], [x5], #0x8\n"
+ "ld1 { v6.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v18.s }[2], [x5]\n"
+ "ld1 { v6.s }[2], [x8]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 119f\n"
- "ld1 { v12.s }[0], [x10]\n"
- "ld1 { v19.s }[0], [x1]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v18.s }[0], [x5]\n"
+ "ld1 { v6.s }[0], [x8]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v12.4s\n"
- "sqrdmulh v17.4s, v17.4s, v12.4s\n"
- "add x16, x16, x22\n"
- "add x8, x8, x22\n"
- "sqrdmulh v10.4s, v10.4s, v12.4s\n"
- "sqrdmulh v6.4s, v6.4s, v12.4s\n"
- "add x4, x4, x22\n"
- "add x7, x7, x22\n"
- "and v23.16b, v15.16b, v19.16b\n"
- "sqrdmulh v16.4s, v16.4s, v20.4s\n"
- "and v22.16b, v17.16b, v19.16b\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "and v21.16b, v10.16b, v19.16b\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "and v26.16b, v6.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v4.16b, v16.16b, v29.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v2.16b, v8.16b, v29.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v3.16b, v7.16b, v29.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "and v30.16b, v13.16b, v6.16b\n"
+ "add x17, x17, x10\n"
+ "add x6, x6, x10\n"
+ "sqrdmulh v19.4s, v19.4s, v5.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x7, x7, x10\n"
+ "add x16, x16, x10\n"
+ "and v16.16b, v19.16b, v22.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v18.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v0.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v5.4s\n"
+ "and v18.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v5.4s\n"
+ "and v30.16b, v17.16b, v6.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v10.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v22.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v25.16b, v5.16b, v29.16b\n"
- "sqadd v15.4s, v15.4s, v23.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v22.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v21.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v7.4s, v7.4s, v3.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v25.4s\n"
- "srshl v16.4s, v16.4s, v29.4s\n"
- "sqxtn v15.4h, v15.4s\n"
- "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "sqadd v10.4s, v10.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v7.4s, v7.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
"sqxtn v17.4h, v17.4s\n"
- "srshl v7.4s, v7.4s, v29.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v15.8h, v16.4s\n"
- "sqxtn2 v17.8h, v8.4s\n"
- "sqxtn2 v10.8h, v7.4s\n"
- "sqxtn2 v6.8h, v5.4s\n"
- "sqadd v15.8h, v15.8h, v18.8h\n"
- "sqadd v17.8h, v17.8h, v18.8h\n"
- "sqadd v10.8h, v10.8h, v18.8h\n"
- "sqadd v6.8h, v6.8h, v18.8h\n"
- "smax v15.8h, v15.8h, v11.8h\n"
- "smax v17.8h, v17.8h, v11.8h\n"
- "smax v10.8h, v10.8h, v11.8h\n"
- "smax v6.8h, v6.8h, v11.8h\n"
- "smin v15.8h, v15.8h, v13.8h\n"
- "smin v17.8h, v17.8h, v13.8h\n"
- "smin v10.8h, v10.8h, v13.8h\n"
- "smin v6.8h, v6.8h, v13.8h\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqxtn2 v13.8h, v19.4s\n"
+ "sqxtn2 v20.8h, v10.4s\n"
+ "sqxtn2 v8.8h, v7.4s\n"
+ "sqxtn2 v17.8h, v21.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v20.8h, v20.8h, v14.8h\n"
+ "sqadd v8.8h, v8.8h, v14.8h\n"
+ "sqadd v17.8h, v17.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smax v17.8h, v17.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v11.8h\n"
+ "smin v20.8h, v20.8h, v11.8h\n"
+ "smin v8.8h, v8.8h, v11.8h\n"
+ "smin v17.8h, v17.8h, v11.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "tbz x0, #2, 121f\n"
- "st1 { v15.s }[0], [x16], #0x4\n"
- "st1 { v17.s }[0], [x8], #0x4\n"
- "st1 { v10.s }[0], [x4], #0x4\n"
- "st1 { v6.s }[0], [x7], #0x4\n"
- "tbz x0, #1, 120f\n"
- "st1 { v15.h }[2], [x16], #0x2\n"
- "st1 { v17.h }[2], [x8], #0x2\n"
- "st1 { v10.h }[2], [x4], #0x2\n"
- "st1 { v6.h }[2], [x7], #0x2\n"
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[6], [x16], #0x1\n"
- "st1 { v17.b }[6], [x8], #0x1\n"
- "st1 { v10.b }[6], [x4], #0x1\n"
- "st1 { v6.b }[6], [x7], #0x1\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v13.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x6], #0x4\n"
+ "st1 { v8.s }[0], [x7], #0x4\n"
+ "st1 { v17.s }[0], [x16], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v13.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x6], #0x2\n"
+ "st1 { v8.h }[2], [x7], #0x2\n"
+ "st1 { v17.h }[2], [x16], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x6], #0x1\n"
+ "st1 { v8.b }[6], [x7], #0x1\n"
+ "st1 { v17.b }[6], [x16], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[4], [x16], #0x1\n"
- "st1 { v17.b }[4], [x8], #0x1\n"
- "st1 { v10.b }[4], [x4], #0x1\n"
- "st1 { v6.b }[4], [x7], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x6], #0x1\n"
+ "st1 { v8.b }[4], [x7], #0x1\n"
+ "st1 { v17.b }[4], [x16], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x0, #1, 122f\n"
- "st1 { v15.h }[0], [x16], #0x2\n"
- "st1 { v17.h }[0], [x8], #0x2\n"
- "st1 { v10.h }[0], [x4], #0x2\n"
- "st1 { v6.h }[0], [x7], #0x2\n"
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[2], [x16], #0x1\n"
- "st1 { v17.b }[2], [x8], #0x1\n"
- "st1 { v10.b }[2], [x4], #0x1\n"
- "st1 { v6.b }[2], [x7], #0x1\n"
+ "tbz x1, #1, 122f\n"
+ "st1 { v13.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x6], #0x2\n"
+ "st1 { v8.h }[0], [x7], #0x2\n"
+ "st1 { v17.h }[0], [x16], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x6], #0x1\n"
+ "st1 { v8.b }[2], [x7], #0x1\n"
+ "st1 { v17.b }[2], [x16], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x0, #0, 123f\n"
- "st1 { v15.b }[0], [x16], #0x1\n"
- "st1 { v17.b }[0], [x8], #0x1\n"
- "st1 { v10.b }[0], [x4], #0x1\n"
- "st1 { v6.b }[0], [x7], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v13.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x6], #0x1\n"
+ "st1 { v8.b }[0], [x7], #0x1\n"
+ "st1 { v17.b }[0], [x16], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 08a2b7a98e..1676119bc1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,583 +41,577 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
)
{
__asm__ __volatile__(
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v12.4s }, [x19]\n"
+ "lsr x12, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ld1r { v10.16b }, [x19]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v9.16b }, [x20]\n"
- "ld1r { v8.4s }, [x19]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
- "ld1r { v7.4s }, [x20]\n"
- "ld1r { v6.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
- "ld1r { v5.4s }, [x19]\n"
- "lsr x10, %x[n_channels], #0x2\n"
- "cbz x10, 6f\n"
+ "cbz x12, 6f\n"
"1:" // Channel loop
- "movi v27.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x11, #0x2\n"
- "ldr q27, [%x[bias], x19]\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
- "mov v26.16b, v27.16b\n"
- "ldr s16, [%x[params]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "mov v25.16b, v27.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, %x[n_points], #0x1\n"
- "mov v24.16b, v27.16b\n"
- "ldr s4, [x9, x11]\n"
- "mov v23.16b, v27.16b\n"
- "mov v22.16b, v27.16b\n"
- "ldr s3, [x28, x11]\n"
- "mov v21.16b, v27.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v20.16b, v27.16b\n"
- "ldr s2, [x27, x11]\n"
- "mov v19.16b, v27.16b\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
- "ldr s1, [x26, x11]\n"
- "usubl v4.8h, v4.8b, v10.8b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "ldr s0, [x25, x11]\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "ldr s31, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "ldr s30, [x23, x11]\n"
- "ldr s29, [x22, x11]\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "ldr x21, [x20], #0x8\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "ldr s28, [x21, x11]\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "ldr s14, [x10, x11]\n"
+ "ldr s15, [x9, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldr s16, [x28, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x27, x11]\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x26, x11]\n"
+ "ldr s19, [x25, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr s20, [x24, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x22, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "smlal v27.4s, v4.4h, v16.4h\n"
- "ldp x9, x28, [x20], #0x10\n"
- "subs x19, x19, #0x1\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "ldr s4, [x9, x11]\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "ldr s3, [x28, x11]\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "ldr s2, [x27, x11]\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
- "ldr s16, [%x[params]], #0x4\n"
- "usubl v4.8h, v4.8b, v10.8b\n"
- "ldr s1, [x26, x11]\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "ldr s0, [x25, x11]\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "ldr s31, [x24, x11]\n"
- "ldp x23, x22, [x20], #0x10\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "ldr s30, [x23, x11]\n"
- "ldr s29, [x22, x11]\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "ldr x21, [x20], #0x8\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "ldr s28, [x21, x11]\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x10, x11]\n"
+ "ldr s15, [x9, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x28, x11]\n"
+ "ldr s17, [x27, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldr s18, [x26, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x25, x11]\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x20, x20, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x24, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x23, x11]\n"
+ "ldr x22, [x21], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x22, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
- "smlal v27.4s, v4.4h, v16.4h\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
"cbz %x[rq_mul_ptr], 5f\n"
- "lsl x19, x11, #0x2\n"
- "ldr q6, [%x[rq_mul_ptr], x19]\n"
- "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 5f\n"
- "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
"5:" // Channel loop: Load quantisation parameters: Done
- "sshl v27.4s, v27.4s, v7.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
- "sshl v26.4s, v26.4s, v7.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "sshl v25.4s, v25.4s, v7.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
- "sqrdmulh v27.4s, v27.4s, v6.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "sqrdmulh v25.4s, v25.4s, v6.4s\n"
- "sshl v24.4s, v24.4s, v7.4s\n"
- "and v16.16b, v27.16b, v5.16b\n"
- "and v18.16b, v26.16b, v5.16b\n"
- "and v17.16b, v25.16b, v5.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v21.16b, v23.16b, v1.16b\n"
+ "and v20.16b, v24.16b, v1.16b\n"
+ "and v19.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "and v18.16b, v26.16b, v1.16b\n"
+ "and v17.16b, v27.16b, v1.16b\n"
+ "and v16.16b, v28.16b, v1.16b\n"
+ "and v21.16b, v29.16b, v1.16b\n"
+ "and v20.16b, v30.16b, v1.16b\n"
+ "and v19.16b, v31.16b, v1.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
- "sqrdmulh v24.4s, v24.4s, v6.4s\n"
- "srshl v27.4s, v27.4s, v5.4s\n"
- "srshl v26.4s, v26.4s, v5.4s\n"
- "srshl v25.4s, v25.4s, v5.4s\n"
- "and v16.16b, v24.16b, v5.16b\n"
- "add v27.4s, v27.4s, v8.4s\n"
- "add v26.4s, v26.4s, v8.4s\n"
- "add v25.4s, v25.4s, v8.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "smax v27.4s, v27.4s, v12.4s\n"
- "smax v26.4s, v26.4s, v12.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smax v25.4s, v25.4s, v12.4s\n"
- "srshl v24.4s, v24.4s, v5.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x27, x11]\n"
- "add v24.4s, v24.4s, v8.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x26, x11]\n"
- "smax v24.4s, v24.4s, v12.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x25, x11]\n"
- "sshl v23.4s, v23.4s, v7.4s\n"
- "sshl v22.4s, v22.4s, v7.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
- "sqrdmulh v23.4s, v23.4s, v6.4s\n"
- "sqrdmulh v22.4s, v22.4s, v6.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sshl v21.4s, v21.4s, v7.4s\n"
- "and v17.16b, v23.16b, v5.16b\n"
- "and v16.16b, v22.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x24, x11]\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v21.16b, v5.16b\n"
- "sshl v20.4s, v20.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v7.4s\n"
- "srshl v23.4s, v23.4s, v5.4s\n"
- "srshl v22.4s, v22.4s, v5.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v6.4s\n"
- "add v23.4s, v23.4s, v8.4s\n"
- "add v22.4s, v22.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "and v17.16b, v20.16b, v5.16b\n"
- "sqrdmulh v19.4s, v19.4s, v6.4s\n"
- "smax v23.4s, v23.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v5.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v19.16b, v5.16b\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "add v21.4s, v21.4s, v8.4s\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "smax v21.4s, v21.4s, v12.4s\n"
- "srshl v20.4s, v20.4s, v5.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "add v20.4s, v20.4s, v8.4s\n"
- "srshl v19.4s, v19.4s, v5.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x23, x11]\n"
- "add v19.4s, v19.4s, v8.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x22, x11]\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x21, x11]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x20, x11]\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x19, x11]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
"add x11, x11, #0x4\n"
- "cmp x11, x10, LSL #2\n"
+ "cmp x11, x12, LSL #2\n"
"blt 1b\n"
"6:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 24f\n"
- "movi v27.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
"cbz %x[bias], 9f\n"
- "add x19, %x[bias], x11, LSL #2\n"
+ "add x20, %x[bias], x11, LSL #2\n"
"tbz %x[n_channels], #1, 7f\n"
- "ld1 { v27.d }[0], [x19], #0x8\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
- "ld1 { v27.s }[2], [x19], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"b 8f\n"
"7:" // Oddments: Load bias: Bit 1: Unset
- "tbz %x[n_channels], #0, 8f\n"
- "ld1 { v27.s }[0], [x19], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"8:" // Oddments: Load bias: Bit 1: End
-
"9:" // Oddments: Load bias: Done
- "mov v26.16b, v27.16b\n"
- "ldr s16, [%x[params]], #0x4\n"
- "mov x20, %x[inptrs]\n"
- "mov v25.16b, v27.16b\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "ldr x22, [x21], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x10, x10, x11\n"
"add x9, x9, x11\n"
- "mov v24.16b, v27.16b\n"
- "ldp x27, x26, [x20], #0x10\n"
- "mov v23.16b, v27.16b\n"
- "ldp x25, x24, [x20], #0x10\n"
- "mov v22.16b, v27.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"add x28, x28, x11\n"
- "mov v21.16b, v27.16b\n"
- "ldp x23, x22, [x20], #0x10\n"
- "mov v20.16b, v27.16b\n"
"add x27, x27, x11\n"
- "mov v19.16b, v27.16b\n"
- "ldr x21, [x20], #0x8\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
"add x26, x26, x11\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 10f\n"
- "ldr h4, [x9], #0x2\n"
- "ldr h3, [x28], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h31, [x24], #0x2\n"
- "ldr h30, [x23], #0x2\n"
- "ldr h29, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v4.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x28], #0x1\n"
- "ld1 { v2.b }[2], [x27], #0x1\n"
- "ld1 { v1.b }[2], [x26], #0x1\n"
- "ld1 { v0.b }[2], [x25], #0x1\n"
- "ld1 { v31.b }[2], [x24], #0x1\n"
- "ld1 { v30.b }[2], [x23], #0x1\n"
- "ld1 { v29.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v14.b }[2], [x10], #0x1\n"
+ "ld1 { v15.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v17.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v19.b }[2], [x25], #0x1\n"
+ "ld1 { v20.b }[2], [x24], #0x1\n"
+ "ld1 { v21.b }[2], [x23], #0x1\n"
+ "ld1 { v22.b }[2], [x22], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 11f\n"
- "ldr b4, [x9], #0x1\n"
- "ldr b3, [x28], #0x1\n"
- "ldr b2, [x27], #0x1\n"
- "ldr b1, [x26], #0x1\n"
- "ldr b0, [x25], #0x1\n"
- "ldr b31, [x24], #0x1\n"
- "ldr b30, [x23], #0x1\n"
- "ldr b29, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
+ "ldr b14, [x10], #0x1\n"
+ "ldr b15, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b19, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b21, [x23], #0x1\n"
+ "ldr b22, [x22], #0x1\n"
"11:" // Oddments: Load: Bit 1: End
- "usubl v4.8h, v4.8b, v10.8b\n"
- "subs x19, %x[n_points], #0x1\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "subs x20, %x[n_points], #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"ble 15f\n"
"12:" // Oddments: Planar loop
- "smlal v27.4s, v4.4h, v16.4h\n"
- "ldp x9, x28, [x20], #0x10\n"
+ "ldp x10, x9, [x21], #0x10\n"
+ "ldp x28, x27, [x21], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x26, x25, [x21], #0x10\n"
+ "ldp x24, x23, [x21], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x22, [x21], #0x8\n"
+ "add x10, x10, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
"add x9, x9, x11\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "ldp x27, x26, [x20], #0x10\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "ldp x25, x24, [x20], #0x10\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
"add x28, x28, x11\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "ldp x23, x22, [x20], #0x10\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"add x27, x27, x11\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "ldr x21, [x20], #0x8\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
"add x26, x26, x11\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
- "ldr s16, [%x[params]], #0x4\n"
"add x25, x25, x11\n"
- "ssubl v16.8h, v16.8b, v9.8b\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
- "add x21, x21, x11\n"
"tbz %x[n_channels], #1, 13f\n"
- "ldr h4, [x9], #0x2\n"
- "ldr h3, [x28], #0x2\n"
- "ldr h2, [x27], #0x2\n"
- "ldr h1, [x26], #0x2\n"
- "ldr h0, [x25], #0x2\n"
- "ldr h31, [x24], #0x2\n"
- "ldr h30, [x23], #0x2\n"
- "ldr h29, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
+ "ldr h14, [x10], #0x2\n"
+ "ldr h15, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h17, [x27], #0x2\n"
+ "ldr h18, [x26], #0x2\n"
+ "ldr h19, [x25], #0x2\n"
+ "ldr h20, [x24], #0x2\n"
+ "ldr h21, [x23], #0x2\n"
+ "ldr h22, [x22], #0x2\n"
"tbz %x[n_channels], #0, 14f\n"
- "ld1 { v4.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x28], #0x1\n"
- "ld1 { v2.b }[2], [x27], #0x1\n"
- "ld1 { v1.b }[2], [x26], #0x1\n"
- "ld1 { v0.b }[2], [x25], #0x1\n"
- "ld1 { v31.b }[2], [x24], #0x1\n"
- "ld1 { v30.b }[2], [x23], #0x1\n"
- "ld1 { v29.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v14.b }[2], [x10], #0x1\n"
+ "ld1 { v15.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v17.b }[2], [x27], #0x1\n"
+ "ld1 { v18.b }[2], [x26], #0x1\n"
+ "ld1 { v19.b }[2], [x25], #0x1\n"
+ "ld1 { v20.b }[2], [x24], #0x1\n"
+ "ld1 { v21.b }[2], [x23], #0x1\n"
+ "ld1 { v22.b }[2], [x22], #0x1\n"
"b 14f\n"
"13:" // Oddments: Planar loop: Load: Bit 1: Unset
- "tbz %x[n_channels], #0, 14f\n"
- "ldr b4, [x9], #0x1\n"
- "ldr b3, [x28], #0x1\n"
- "ldr b2, [x27], #0x1\n"
- "ldr b1, [x26], #0x1\n"
- "ldr b0, [x25], #0x1\n"
- "ldr b31, [x24], #0x1\n"
- "ldr b30, [x23], #0x1\n"
- "ldr b29, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
+ "ldr b14, [x10], #0x1\n"
+ "ldr b15, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b17, [x27], #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "ldr b19, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b21, [x23], #0x1\n"
+ "ldr b22, [x22], #0x1\n"
"14:" // Oddments: Planar loop: Load: Bit 1: End
- "usubl v4.8h, v4.8b, v10.8b\n"
- "subs x19, x19, #0x1\n"
- "usubl v3.8h, v3.8b, v10.8b\n"
- "usubl v2.8h, v2.8b, v10.8b\n"
- "usubl v1.8h, v1.8b, v10.8b\n"
- "usubl v0.8h, v0.8b, v10.8b\n"
- "usubl v31.8h, v31.8b, v10.8b\n"
- "usubl v30.8h, v30.8b, v10.8b\n"
- "usubl v29.8h, v29.8b, v10.8b\n"
- "usubl v28.8h, v28.8b, v10.8b\n"
+ "subs x20, x20, #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
"bgt 12b\n"
"15:" // Oddments: Planar tail
- "smlal v27.4s, v4.4h, v16.4h\n"
- "smlal v26.4s, v3.4h, v16.4h\n"
- "smlal v25.4s, v2.4h, v16.4h\n"
- "smlal v24.4s, v1.4h, v16.4h\n"
- "smlal v23.4s, v0.4h, v16.4h\n"
- "smlal v22.4s, v31.4h, v16.4h\n"
- "smlal v21.4s, v30.4h, v16.4h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal v19.4s, v28.4h, v16.4h\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
"cbz %x[rq_mul_ptr], 21f\n"
- "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
- "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
- "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v6.d }[0], [x21], #0x8\n"
- "ld1 { v5.d }[0], [x20], #0x8\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
"cbz %x[rq_left_shift_ptr], 16f\n"
- "ld1 { v7.d }[0], [x19], #0x8\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
"16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
"tbz %x[n_channels], #0, 20f\n"
- "ld1 { v6.s }[2], [x21], #0x4\n"
- "ld1 { v5.s }[2], [x20], #0x4\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
"cbz %x[rq_left_shift_ptr], 17f\n"
- "ld1 { v7.s }[2], [x19], #0x4\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
"17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
"b 20f\n"
"18:" // Oddments: Load quantisation parameters: Bit 1: Unset
- "tbz %x[n_channels], #0, 20f\n"
- "ld1 { v6.s }[0], [x21], #0x4\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
"cbz %x[rq_left_shift_ptr], 19f\n"
- "ld1 { v7.s }[0], [x19], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
"20:" // Oddments: Load quantisation parameters: Bit 1: End
"21:" // Oddments: Load quantisation parameters: Done
- "sshl v27.4s, v27.4s, v7.4s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v21.16b, v23.16b, v1.16b\n"
+ "and v20.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
- "sqrdmulh v27.4s, v27.4s, v6.4s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "sshl v26.4s, v26.4s, v7.4s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
"add x26, x26, x11\n"
- "sshl v25.4s, v25.4s, v7.4s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "sshl v24.4s, v24.4s, v7.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
+ "and v19.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
- "and v16.16b, v27.16b, v5.16b\n"
"add x24, x24, x11\n"
- "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
"add x23, x23, x11\n"
- "sqrdmulh v25.4s, v25.4s, v6.4s\n"
"add x22, x22, x11\n"
- "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
"add x21, x21, x11\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
"add x20, x20, x11\n"
- "and v18.16b, v26.16b, v5.16b\n"
- "add x19, x19, x11\n"
- "and v17.16b, v25.16b, v5.16b\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "and v18.16b, v26.16b, v1.16b\n"
+ "and v17.16b, v27.16b, v1.16b\n"
+ "and v16.16b, v28.16b, v1.16b\n"
+ "and v21.16b, v29.16b, v1.16b\n"
+ "and v20.16b, v30.16b, v1.16b\n"
+ "and v19.16b, v31.16b, v1.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v24.16b, v5.16b\n"
- "srshl v27.4s, v27.4s, v5.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "add v27.4s, v27.4s, v8.4s\n"
- "srshl v26.4s, v26.4s, v5.4s\n"
- "srshl v25.4s, v25.4s, v5.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "smax v27.4s, v27.4s, v12.4s\n"
- "add v26.4s, v26.4s, v8.4s\n"
- "add v25.4s, v25.4s, v8.4s\n"
- "srshl v24.4s, v24.4s, v5.4s\n"
- "smin v27.4s, v27.4s, v11.4s\n"
- "smax v26.4s, v26.4s, v12.4s\n"
- "smax v25.4s, v25.4s, v12.4s\n"
- "add v24.4s, v24.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smax v24.4s, v24.4s, v12.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v24.4s, v24.4s, v11.4s\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sshl v23.4s, v23.4s, v7.4s\n"
- "sshl v22.4s, v22.4s, v7.4s\n"
- "sqrdmulh v23.4s, v23.4s, v6.4s\n"
- "sqrdmulh v22.4s, v22.4s, v6.4s\n"
- "sshl v21.4s, v21.4s, v7.4s\n"
- "sshl v20.4s, v20.4s, v7.4s\n"
- "and v17.16b, v23.16b, v5.16b\n"
- "and v16.16b, v22.16b, v5.16b\n"
- "sqrdmulh v21.4s, v21.4s, v6.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v6.4s\n"
- "sqadd v23.4s, v23.4s, v17.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v16.16b, v21.16b, v5.16b\n"
- "and v17.16b, v20.16b, v5.16b\n"
- "srshl v23.4s, v23.4s, v5.4s\n"
- "srshl v22.4s, v22.4s, v5.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "add v23.4s, v23.4s, v8.4s\n"
- "add v22.4s, v22.4s, v8.4s\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "smax v23.4s, v23.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v5.4s\n"
- "srshl v20.4s, v20.4s, v5.4s\n"
- "smin v23.4s, v23.4s, v11.4s\n"
- "smin v22.4s, v22.4s, v11.4s\n"
- "add v21.4s, v21.4s, v8.4s\n"
- "add v20.4s, v20.4s, v8.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smax v21.4s, v21.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v12.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "sshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "sqrdmulh v19.4s, v19.4s, v6.4s\n"
- "and v16.16b, v19.16b, v5.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v5.4s\n"
- "add v19.4s, v19.4s, v8.4s\n"
- "smax v19.4s, v19.4s, v12.4s\n"
- "smin v19.4s, v19.4s, v11.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_channels], #1, 22f\n"
- "st1 { v27.h }[0], [x27], #0x2\n"
- "st1 { v26.h }[0], [x26], #0x2\n"
- "st1 { v25.h }[0], [x25], #0x2\n"
- "st1 { v24.h }[0], [x24], #0x2\n"
- "st1 { v23.h }[0], [x23], #0x2\n"
- "st1 { v22.h }[0], [x22], #0x2\n"
- "st1 { v21.h }[0], [x21], #0x2\n"
- "st1 { v20.h }[0], [x20], #0x2\n"
- "st1 { v19.h }[0], [x19], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "st1 { v27.b }[2], [x27], #0x1\n"
- "st1 { v26.b }[2], [x26], #0x1\n"
- "st1 { v25.b }[2], [x25], #0x1\n"
- "st1 { v24.b }[2], [x24], #0x1\n"
- "st1 { v23.b }[2], [x23], #0x1\n"
- "st1 { v22.b }[2], [x22], #0x1\n"
- "st1 { v21.b }[2], [x21], #0x1\n"
- "st1 { v20.b }[2], [x20], #0x1\n"
- "st1 { v19.b }[2], [x19], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: Store: Bit 1: Unset
- "tbz %x[n_channels], #0, 23f\n"
- "st1 { v27.b }[0], [x27], #0x1\n"
- "st1 { v26.b }[0], [x26], #0x1\n"
- "st1 { v25.b }[0], [x25], #0x1\n"
- "st1 { v24.b }[0], [x24], #0x1\n"
- "st1 { v23.b }[0], [x23], #0x1\n"
- "st1 { v22.b }[0], [x22], #0x1\n"
- "st1 { v21.b }[0], [x21], #0x1\n"
- "st1 { v20.b }[0], [x20], #0x1\n"
- "st1 { v19.b }[0], [x19], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
"23:" // Oddments: Store: Bit 1: End
"24:" // End
: [params] "+&r" (params)
: [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 09b274056f..976434aa28 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,1439 +45,1433 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
)
{
__asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v3.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
"mov x9, #0x0\n"
- "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v14.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v13.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v12.16b }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v11.16b }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v10.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v9.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
- "ld1r { v8.4s }, [x19]\n"
- "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v7.4s }, [x19]\n"
- "lsr x28, %x[n_output_channels], #0x2\n"
- "cbz x28, 9f\n"
+ "cbz x10, 9f\n"
"1:" // Output channel loop
- "movi v16.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "lsl x19, x9, #0x2\n"
- "ldr q16, [%x[bias], x19]\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
"2:" // Output channel loop: Load bias: Done
- "mov v6.16b, v16.16b\n"
- "mov v5.16b, v16.16b\n"
- "mov v4.16b, v16.16b\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v16.16b\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
"cbz %x[rq_mul_ptr], 3f\n"
- "lsl x19, x9, #0x2\n"
- "ldr q8, [%x[rq_mul_ptr], x19]\n"
- "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q10, [%x[rq_right_shift_ptr], x20]\n"
"cbz %x[rq_left_shift_ptr], 3f\n"
- "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+ "ldr q15, [%x[rq_left_shift_ptr], x20]\n"
"3:" // Output channel loop: Load quantization parameters: Done
- "ldr s17, [%x[weights]], #0x4\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "mov x19, %x[inptrs]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "ldr d3, [x25, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "cbz x20, 7f\n"
- "ldp x25, x27, [x19], #0x10\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "cbz x21, 7f\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"ldr d1, [x25, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d0, [x28, #0x0]\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
"beq 5f\n"
"4:" // Output channel loop: Kernel loop
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
"ldr d1, [x25, #0x0]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"bgt 4b\n"
"5:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 6f\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
- "b 8f\n"
- "6:" // Output channel loop: Odd tail
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldr d7, [x28, #0x0]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
- "b 8f\n"
- "7:" // Output channel loop: Single kernel point
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s6, [x19, x9]\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "str s5, [x20, x9]\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s4, [x21, x9]\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s31, [x22, x9]\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s30, [x23, x9]\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s29, [x24, x9]\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s28, [x25, x9]\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s27, [x26, x9]\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x19, x9]\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x20, x9]\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x21, x9]\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s23, [x22, x9]\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x23, x9]\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s21, [x24, x9]\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s20, [x25, x9]\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s19, [x26, x9]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x20, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x21, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x22, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x23, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x24, x9]\n"
+ "str s29, [x25, x9]\n"
+ "str s30, [x26, x9]\n"
+ "str s31, [x27, x9]\n"
"8:" // Output channel loop: Done
"add x9, x9, #0x4\n"
- "cmp x9, x28, LSL #2\n"
+ "cmp x9, x10, LSL #2\n"
"blt 1b\n"
"tst %x[n_output_channels], #0x3\n"
"beq 26f\n"
"9:" // Output channel oddments
- "movi v16.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
"cbz %x[bias], 12f\n"
- "add x19, %x[bias], x9, LSL #2\n"
+ "add x20, %x[bias], x9, LSL #2\n"
"tbz %x[n_output_channels], #1, 10f\n"
- "ld1 { v16.d }[0], [x19], #0x8\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 11f\n"
- "ld1 { v16.s }[2], [x19]\n"
+ "ld1 { v31.s }[2], [x20]\n"
"b 11f\n"
"10:" // Output channel oddments: Load bias: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 11f\n"
- "ld1 { v16.s }[0], [x19]\n"
+ "ld1 { v31.s }[0], [x20]\n"
"11:" // Output channel oddments: Load bias: Bit 1: End
-
"12:" // Output channel oddments: Load bias: Done
- "mov v6.16b, v16.16b\n"
- "mov v5.16b, v16.16b\n"
- "mov v4.16b, v16.16b\n"
- "mov v31.16b, v16.16b\n"
- "mov v30.16b, v16.16b\n"
- "mov v29.16b, v16.16b\n"
- "mov v28.16b, v16.16b\n"
- "mov v27.16b, v16.16b\n"
- "mov v26.16b, v16.16b\n"
- "mov v25.16b, v16.16b\n"
- "mov v24.16b, v16.16b\n"
- "mov v23.16b, v16.16b\n"
- "mov v22.16b, v16.16b\n"
- "mov v21.16b, v16.16b\n"
- "mov v20.16b, v16.16b\n"
- "mov v19.16b, v16.16b\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
"cbz %x[rq_mul_ptr], 18f\n"
- "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
- "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
- "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
"cbz %x[rq_left_shift_ptr], 15f\n"
"tbz %x[n_output_channels], #1, 13f\n"
- "ld1 { v8.d }[0], [x21], #0x8\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
- "ld1 { v9.d }[0], [x19], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_output_channels], #0, 14f\n"
- "ld1 { v8.s }[2], [x21], #0x4\n"
- "ld1 { v7.s }[2], [x20], #0x4\n"
- "ld1 { v9.s }[2], [x19], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
"b 14f\n"
"13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 14f\n"
- "ld1 { v8.s }[0], [x21], #0x4\n"
- "ld1 { v7.s }[0], [x20], #0x4\n"
- "ld1 { v9.s }[0], [x19], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
"14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
"b 18f\n"
"15:" // Output channel oddments: Load quantization parameters: No left shift
"tbz %x[n_output_channels], #1, 16f\n"
- "ld1 { v8.d }[0], [x21], #0x8\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
"tbz %x[n_output_channels], #0, 17f\n"
- "ld1 { v8.s }[2], [x21], #0x4\n"
- "ld1 { v7.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
"b 17f\n"
"16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 17f\n"
- "ld1 { v8.s }[0], [x21], #0x4\n"
- "ld1 { v7.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
"17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
-
"18:" // Output channel oddments: Load quantization parameters: Done
- "ldr s17, [%x[weights]], #0x4\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "mov x19, %x[inptrs]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "lsr x20, %x[kernel_points], #0x1\n"
- "ldr d3, [x25, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "cbz x20, 22f\n"
- "ldp x25, x27, [x19], #0x10\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "lsr x21, %x[kernel_points], #0x1\n"
+ "ldr d2, [x25, #0x0]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "cbz x21, 22f\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"ldr d1, [x25, #0x0]\n"
- "subs x20, x20, #0x1\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d0, [x28, #0x0]\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
"beq 20f\n"
"19:" // Output channel oddments: Kernel loop
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "subs x20, x20, #0x1\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "subs x21, x21, #0x1\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
"ldr d1, [x25, #0x0]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "ldr d0, [x27, #0x0]\n"
- "usubl v1.8h, v1.8b, v12.8b\n"
- "ldr s16, [%x[weights]], #0x4\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
- "ssubl v16.8h, v16.8b, v11.8b\n"
+ "usubl v1.8h, v1.8b, v3.8b\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "ldr d0, [x28, #0x0]\n"
+ "ldr s6, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v3.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
"bgt 19b\n"
"20:" // Output channel oddments: Kernel loop tail
"tbnz %x[kernel_points], #0, 21f\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
"b 23f\n"
"21:" // Output channel oddments: Odd tail
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "ldp x25, x27, [x19], #0x10\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "ldr d3, [x25, #0x0]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
- "ldr d2, [x27, #0x0]\n"
- "usubl v3.8h, v3.8b, v12.8b\n"
- "ldr s17, [%x[weights]], #0x4\n"
- "smlal v6.4s, v16.4h, v1.h[0]\n"
- "smlal v5.4s, v16.4h, v1.h[1]\n"
- "smlal v4.4s, v16.4h, v1.h[2]\n"
- "usubl v2.8h, v2.8b, v12.8b\n"
- "ssubl v17.8h, v17.8b, v11.8b\n"
- "smlal v31.4s, v16.4h, v1.h[3]\n"
- "smlal v30.4s, v16.4h, v1.h[4]\n"
- "smlal v29.4s, v16.4h, v1.h[5]\n"
- "smlal v28.4s, v16.4h, v1.h[6]\n"
- "smlal v27.4s, v16.4h, v1.h[7]\n"
- "smlal v26.4s, v16.4h, v0.h[0]\n"
- "smlal v25.4s, v16.4h, v0.h[1]\n"
- "smlal v24.4s, v16.4h, v0.h[2]\n"
- "smlal v23.4s, v16.4h, v0.h[3]\n"
- "smlal v22.4s, v16.4h, v0.h[4]\n"
- "smlal v21.4s, v16.4h, v0.h[5]\n"
- "smlal v20.4s, v16.4h, v0.h[6]\n"
- "smlal v19.4s, v16.4h, v0.h[7]\n"
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldp x25, x28, [x20], #0x10\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "ldr d2, [x25, #0x0]\n"
+ "usubl v2.8h, v2.8b, v3.8b\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
+ "ldr d7, [x28, #0x0]\n"
+ "ldr s8, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v6.4h, v1.h[0]\n"
+ "smlal v17.4s, v6.4h, v1.h[1]\n"
+ "usubl v7.8h, v7.8b, v3.8b\n"
+ "smlal v18.4s, v6.4h, v1.h[2]\n"
+ "smlal v19.4s, v6.4h, v1.h[3]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "smlal v20.4s, v6.4h, v1.h[4]\n"
+ "smlal v21.4s, v6.4h, v1.h[5]\n"
+ "smlal v22.4s, v6.4h, v1.h[6]\n"
+ "smlal v23.4s, v6.4h, v1.h[7]\n"
+ "smlal v24.4s, v6.4h, v0.h[0]\n"
+ "smlal v25.4s, v6.4h, v0.h[1]\n"
+ "smlal v26.4s, v6.4h, v0.h[2]\n"
+ "smlal v27.4s, v6.4h, v0.h[3]\n"
+ "smlal v28.4s, v6.4h, v0.h[4]\n"
+ "smlal v29.4s, v6.4h, v0.h[5]\n"
+ "smlal v30.4s, v6.4h, v0.h[6]\n"
+ "smlal v31.4s, v6.4h, v0.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
"b 23f\n"
"22:" // Output channel oddments: Single kernel point
- "smlal v6.4s, v17.4h, v3.h[0]\n"
- "smlal v5.4s, v17.4h, v3.h[1]\n"
- "smlal v4.4s, v17.4h, v3.h[2]\n"
- "smlal v31.4s, v17.4h, v3.h[3]\n"
- "smlal v30.4s, v17.4h, v3.h[4]\n"
- "smlal v29.4s, v17.4h, v3.h[5]\n"
- "smlal v28.4s, v17.4h, v3.h[6]\n"
- "smlal v27.4s, v17.4h, v3.h[7]\n"
- "smlal v26.4s, v17.4h, v2.h[0]\n"
- "smlal v25.4s, v17.4h, v2.h[1]\n"
- "smlal v24.4s, v17.4h, v2.h[2]\n"
- "smlal v23.4s, v17.4h, v2.h[3]\n"
- "smlal v22.4s, v17.4h, v2.h[4]\n"
- "smlal v21.4s, v17.4h, v2.h[5]\n"
- "smlal v20.4s, v17.4h, v2.h[6]\n"
- "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v16.4s, v8.4h, v2.h[0]\n"
+ "smlal v17.4s, v8.4h, v2.h[1]\n"
+ "smlal v18.4s, v8.4h, v2.h[2]\n"
+ "smlal v19.4s, v8.4h, v2.h[3]\n"
+ "smlal v20.4s, v8.4h, v2.h[4]\n"
+ "smlal v21.4s, v8.4h, v2.h[5]\n"
+ "smlal v22.4s, v8.4h, v2.h[6]\n"
+ "smlal v23.4s, v8.4h, v2.h[7]\n"
+ "smlal v24.4s, v8.4h, v7.h[0]\n"
+ "smlal v25.4s, v8.4h, v7.h[1]\n"
+ "smlal v26.4s, v8.4h, v7.h[2]\n"
+ "smlal v27.4s, v8.4h, v7.h[3]\n"
+ "smlal v28.4s, v8.4h, v7.h[4]\n"
+ "smlal v29.4s, v8.4h, v7.h[5]\n"
+ "smlal v30.4s, v8.4h, v7.h[6]\n"
+ "smlal v31.4s, v8.4h, v7.h[7]\n"
"23:" // Output channel oddments: Done
- "sshl v6.4s, v6.4s, v9.4s\n"
- "sshl v5.4s, v5.4s, v9.4s\n"
- "sshl v4.4s, v4.4s, v9.4s\n"
- "sqrdmulh v6.4s, v6.4s, v8.4s\n"
- "sqrdmulh v5.4s, v5.4s, v8.4s\n"
- "sqrdmulh v4.4s, v4.4s, v8.4s\n"
- "sshl v31.4s, v31.4s, v9.4s\n"
- "and v18.16b, v6.16b, v7.16b\n"
- "and v16.16b, v5.16b, v7.16b\n"
- "and v17.16b, v4.16b, v7.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "sqadd v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
- "srshl v6.4s, v6.4s, v7.4s\n"
- "srshl v5.4s, v5.4s, v7.4s\n"
- "srshl v4.4s, v4.4s, v7.4s\n"
- "and v16.16b, v31.16b, v7.16b\n"
- "add v6.4s, v6.4s, v10.4s\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "add v4.4s, v4.4s, v10.4s\n"
- "smin v6.4s, v6.4s, v13.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smin v4.4s, v4.4s, v13.4s\n"
- "smax v6.4s, v6.4s, v14.4s\n"
- "smax v5.4s, v5.4s, v14.4s\n"
- "smax v4.4s, v4.4s, v14.4s\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "sshl v30.4s, v30.4s, v9.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "sshl v29.4s, v29.4s, v9.4s\n"
- "sshl v28.4s, v28.4s, v9.4s\n"
- "srshl v31.4s, v31.4s, v7.4s\n"
- "and v16.16b, v30.16b, v7.16b\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "add v31.4s, v31.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v17.16b, v29.16b, v7.16b\n"
- "smin v31.4s, v31.4s, v13.4s\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "smax v31.4s, v31.4s, v14.4s\n"
- "and v16.16b, v28.16b, v7.16b\n"
- "srshl v30.4s, v30.4s, v7.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "add v30.4s, v30.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v29.4s, v29.4s, v7.4s\n"
- "smin v30.4s, v30.4s, v13.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "sshl v27.4s, v27.4s, v9.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v10.4s\n"
- "srshl v28.4s, v28.4s, v7.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "smin v29.4s, v29.4s, v13.4s\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "add v28.4s, v28.4s, v10.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
- "smin v28.4s, v28.4s, v13.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "sshl v26.4s, v26.4s, v9.4s\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "and v16.16b, v27.16b, v7.16b\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "and v17.16b, v26.16b, v7.16b\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "sshl v25.4s, v25.4s, v9.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
- "srshl v27.4s, v27.4s, v7.4s\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "sshl v24.4s, v24.4s, v9.4s\n"
- "and v16.16b, v25.16b, v7.16b\n"
- "add v27.4s, v27.4s, v10.4s\n"
- "srshl v26.4s, v26.4s, v7.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v27.4s, v27.4s, v13.4s\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "add v26.4s, v26.4s, v10.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "sqadd v25.4s, v25.4s, v16.4s\n"
- "smin v26.4s, v26.4s, v13.4s\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "and v17.16b, v24.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "srshl v25.4s, v25.4s, v7.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "sshl v23.4s, v23.4s, v9.4s\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "add v25.4s, v25.4s, v10.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "smin v25.4s, v25.4s, v13.4s\n"
- "sshl v22.4s, v22.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v7.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "and v16.16b, v23.16b, v7.16b\n"
- "sqrdmulh v22.4s, v22.4s, v8.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "add v24.4s, v24.4s, v10.4s\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v24.4s, v24.4s, v13.4s\n"
- "and v17.16b, v22.16b, v7.16b\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshl v21.4s, v21.4s, v9.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "srshl v23.4s, v23.4s, v7.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "sqadd v22.4s, v22.4s, v17.4s\n"
- "sqrdmulh v21.4s, v21.4s, v8.4s\n"
- "add v23.4s, v23.4s, v10.4s\n"
- "sshl v20.4s, v20.4s, v9.4s\n"
- "srshl v22.4s, v22.4s, v7.4s\n"
- "smin v23.4s, v23.4s, v13.4s\n"
- "and v16.16b, v21.16b, v7.16b\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smax v23.4s, v23.4s, v14.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "smin v22.4s, v22.4s, v13.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v21.4s, v21.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v14.4s\n"
- "and v16.16b, v20.16b, v7.16b\n"
- "sshl v19.4s, v19.4s, v9.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "srshl v21.4s, v21.4s, v7.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v19.4s, v19.4s, v8.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "sqadd v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "and v16.16b, v19.16b, v7.16b\n"
- "srshl v20.4s, v20.4s, v7.4s\n"
- "smax v21.4s, v21.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "add v20.4s, v20.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "smin v20.4s, v20.4s, v13.4s\n"
- "srshl v19.4s, v19.4s, v7.4s\n"
- "smax v20.4s, v20.4s, v14.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "add v19.4s, v19.4s, v10.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "smin v19.4s, v19.4s, v13.4s\n"
- "smax v19.4s, v19.4s, v14.4s\n"
+ "sshl v16.4s, v16.4s, v15.4s\n"
+ "sshl v17.4s, v17.4s, v15.4s\n"
+ "sshl v18.4s, v18.4s, v15.4s\n"
+ "sshl v19.4s, v19.4s, v15.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v5.16b, v16.16b, v10.16b\n"
+ "and v4.16b, v17.16b, v10.16b\n"
+ "and v2.16b, v18.16b, v10.16b\n"
+ "and v1.16b, v19.16b, v10.16b\n"
+ "sshl v20.4s, v20.4s, v15.4s\n"
+ "sshl v21.4s, v21.4s, v15.4s\n"
+ "sshl v22.4s, v22.4s, v15.4s\n"
+ "sshl v23.4s, v23.4s, v15.4s\n"
+ "sshl v24.4s, v24.4s, v15.4s\n"
+ "sshl v25.4s, v25.4s, v15.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v5.4s\n"
+ "sqadd v17.4s, v17.4s, v4.4s\n"
+ "sqadd v18.4s, v18.4s, v2.4s\n"
+ "sqadd v19.4s, v19.4s, v1.4s\n"
+ "and v8.16b, v20.16b, v10.16b\n"
+ "and v0.16b, v21.16b, v10.16b\n"
+ "and v5.16b, v22.16b, v10.16b\n"
+ "and v4.16b, v23.16b, v10.16b\n"
+ "and v2.16b, v24.16b, v10.16b\n"
+ "and v1.16b, v25.16b, v10.16b\n"
+ "sshl v26.4s, v26.4s, v15.4s\n"
+ "sshl v27.4s, v27.4s, v15.4s\n"
+ "sshl v28.4s, v28.4s, v15.4s\n"
+ "sshl v29.4s, v29.4s, v15.4s\n"
+ "sshl v30.4s, v30.4s, v15.4s\n"
+ "sshl v31.4s, v31.4s, v15.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v22.4s, v22.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sqadd v25.4s, v25.4s, v1.4s\n"
+ "and v8.16b, v26.16b, v10.16b\n"
+ "and v0.16b, v27.16b, v10.16b\n"
+ "and v5.16b, v28.16b, v10.16b\n"
+ "and v4.16b, v29.16b, v10.16b\n"
+ "and v2.16b, v30.16b, v10.16b\n"
+ "and v1.16b, v31.16b, v10.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v8.4s\n"
+ "sqadd v27.4s, v27.4s, v0.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "sqadd v29.4s, v29.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v2.4s\n"
+ "sqadd v31.4s, v31.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v10.4s\n"
+ "srshl v21.4s, v21.4s, v10.4s\n"
+ "srshl v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v10.4s\n"
+ "srshl v27.4s, v27.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v10.4s\n"
+ "srshl v29.4s, v29.4s, v10.4s\n"
+ "srshl v30.4s, v30.4s, v10.4s\n"
+ "srshl v31.4s, v31.4s, v10.4s\n"
+ "add v16.4s, v16.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "smin v17.4s, v17.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v13.4s\n"
+ "smax v17.4s, v17.4s, v13.4s\n"
+ "smax v18.4s, v18.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v23.4s, v23.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
"tbz %x[n_output_channels], #1, 24f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.h }[0], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.h }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.h }[0], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.h }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.h }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.h }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.h }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.h }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.h }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.h }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.h }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.h }[0], [x25]\n"
+ "st1 { v23.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
"add x9, x9, #0x2\n"
- "st1 { v19.h }[0], [x26]\n"
+ "st1 { v24.h }[0], [x20]\n"
+ "st1 { v25.h }[0], [x21]\n"
+ "st1 { v26.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x23]\n"
+ "st1 { v28.h }[0], [x24]\n"
+ "st1 { v29.h }[0], [x25]\n"
+ "st1 { v30.h }[0], [x26]\n"
+ "st1 { v31.h }[0], [x27]\n"
"tbz %x[n_output_channels], #0, 25f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.b }[2], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.b }[2], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.b }[2], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.b }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.b }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.b }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.b }[2], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.b }[2], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.b }[2], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.b }[2], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.b }[2], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.b }[2], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.b }[2], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.b }[2], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v19.b }[2], [x26]\n"
+ "st1 { v23.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v25.b }[2], [x21]\n"
+ "st1 { v26.b }[2], [x22]\n"
+ "st1 { v27.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x24]\n"
+ "st1 { v29.b }[2], [x25]\n"
+ "st1 { v30.b }[2], [x26]\n"
+ "st1 { v31.b }[2], [x27]\n"
"b 25f\n"
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
- "tbz %x[n_output_channels], #0, 25f\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "add x19, x19, x9\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
"add x20, x20, x9\n"
- "st1 { v6.b }[0], [x19]\n"
"add x21, x21, x9\n"
- "st1 { v5.b }[0], [x20]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
"add x22, x22, x9\n"
- "st1 { v4.b }[0], [x21]\n"
"add x23, x23, x9\n"
- "st1 { v31.b }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
"add x24, x24, x9\n"
- "st1 { v30.b }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
"add x25, x25, x9\n"
- "st1 { v29.b }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
"add x26, x26, x9\n"
- "st1 { v28.b }[0], [x25]\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "add x19, x19, x9\n"
- "st1 { v27.b }[0], [x26]\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x27, x27, x9\n"
+ "st1 { v16.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"add x20, x20, x9\n"
- "st1 { v26.b }[0], [x19]\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
+ "st1 { v17.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
"add x21, x21, x9\n"
- "st1 { v25.b }[0], [x20]\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
+ "st1 { v18.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
"add x22, x22, x9\n"
- "st1 { v24.b }[0], [x21]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "st1 { v19.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
"add x23, x23, x9\n"
- "st1 { v23.b }[0], [x22]\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
+ "st1 { v20.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
"add x24, x24, x9\n"
- "st1 { v22.b }[0], [x23]\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
+ "st1 { v21.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
"add x25, x25, x9\n"
- "st1 { v21.b }[0], [x24]\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
+ "st1 { v22.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
"add x26, x26, x9\n"
- "st1 { v20.b }[0], [x25]\n"
- "st1 { v19.b }[0], [x26]\n"
+ "st1 { v23.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "add x27, x27, x9\n"
+ "st1 { v24.b }[0], [x20]\n"
+ "st1 { v25.b }[0], [x21]\n"
+ "st1 { v26.b }[0], [x22]\n"
+ "st1 { v27.b }[0], [x23]\n"
+ "st1 { v28.b }[0], [x24]\n"
+ "st1 { v29.b }[0], [x25]\n"
+ "st1 { v30.b }[0], [x26]\n"
+ "st1 { v31.b }[0], [x27]\n"
"25:" // Output channel oddments: Done: Store: Bit 1: End
"26:" // Done
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 4c4247834c..2ee961db15 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,243 +90,243 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
".inst 0xd503477f // SMSTART ZA\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x4, #0x0\n"
"mov x5, #0x0\n"
- "mov x6, #0x0\n"
"1:" // Tile loop
- "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x21, #0x2\n"
- "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x19, x5, x20\n" // offset = tile_i * ld_input_row
- "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "madd x19, x6, x7, x19\n" // offset += tile_j * ld_input_col
- "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x17, x8, x20, LSL #2\n"
- "add x16, x17, x20, LSL #2\n"
- "add x15, x7, x7\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x13, x16, x20, LSL #2\n"
- "add x12, x15, x7\n"
- "cbnz x6, 2f\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x20, x19, x6\n"
- "sub x20, x20, #0x1\n"
- "lsl x11, %x[n_channels], #0x2\n"
- "mov x19, #0x8\n"
- "and x20, x20, #0x3fffff\n"
- "mul x19, x19, x7\n"
- "orr x11, x11, x20, LSL #22\n"
- "orr x11, x11, x19, LSL #38\n"
- "add x10, x17, x7, LSL #2\n"
- "add x9, x8, x12, LSL #2\n"
- "add x28, x17, x15, LSL #2\n"
- "add x27, x16, x7, LSL #2\n"
- "add x26, x13, x12, LSL #2\n"
- "add x25, x8, x7, LSL #2\n"
- "add x24, x8, x15, LSL #2\n"
- "add x23, x16, x15, LSL #2\n"
- "add x22, x17, x12, LSL #2\n"
- "add x21, x16, x12, LSL #2\n"
- "add x20, x13, x7, LSL #2\n"
- "add x19, x13, x15, LSL #2\n"
- ".inst 0xf8ab495a // rprfm pldonce, x10, [x11]\n"
- ".inst 0xf8ab491a // rprfm pldonce, x8, [x11]\n"
- ".inst 0xf8ab493a // rprfm pldonce, x9, [x11]\n"
- ".inst 0xf8ab4b9a // rprfm pldonce, x28, [x11]\n"
- ".inst 0xf8ab4b7a // rprfm pldonce, x27, [x11]\n"
- ".inst 0xf8ab49ba // rprfm pldonce, x13, [x11]\n"
- ".inst 0xf8ab4b5a // rprfm pldonce, x26, [x11]\n"
- ".inst 0xf8ab4b3a // rprfm pldonce, x25, [x11]\n"
- ".inst 0xf8ab4b1a // rprfm pldonce, x24, [x11]\n"
- ".inst 0xf8ab4afa // rprfm pldonce, x23, [x11]\n"
- ".inst 0xf8ab4a3a // rprfm pldonce, x17, [x11]\n"
- ".inst 0xf8ab4ada // rprfm pldonce, x22, [x11]\n"
- ".inst 0xf8ab4a1a // rprfm pldonce, x16, [x11]\n"
- ".inst 0xf8ab4aba // rprfm pldonce, x21, [x11]\n"
- ".inst 0xf8ab4a9a // rprfm pldonce, x20, [x11]\n"
- ".inst 0xf8ab4a7a // rprfm pldonce, x19, [x11]\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x2\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x7, x7, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x8, x7, x21, LSL #2\n"
+ "add x17, x8, x21, LSL #2\n"
+ "add x16, x6, x6\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x14, x17, x21, LSL #2\n"
+ "add x13, x16, x6\n"
+ "cbnz x5, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x5\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x12, %x[n_channels], #0x2\n"
+ "mov x20, #0x8\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x6\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x11, x8, x6, LSL #2\n"
+ "add x10, x7, x13, LSL #2\n"
+ "add x9, x8, x16, LSL #2\n"
+ "add x28, x17, x6, LSL #2\n"
+ "add x27, x14, x13, LSL #2\n"
+ "add x26, x7, x6, LSL #2\n"
+ "add x25, x7, x16, LSL #2\n"
+ "add x24, x17, x16, LSL #2\n"
+ "add x23, x8, x13, LSL #2\n"
+ "add x22, x17, x13, LSL #2\n"
+ "add x21, x14, x6, LSL #2\n"
+ "add x20, x14, x16, LSL #2\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4a3a // rprfm pldonce, x12, [x17]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x5, x21\n" // offset = tile_i * ld_output_row
- "mov x19, #0x2\n"
- "ld1w { z18.s }, p3/Z, [x14]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x20, x6, x24, x20\n" // offset += tile_j * ld_output_col
- "addvl x14, x14, #1\n"
- ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mul x20, x20, x19\n" // offset *= output_tile_size
- "cntw x22\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x4, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x2\n"
+ "ld1w { z18.s }, p3/Z, [x15]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
+ "addvl x15, x15, #1\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "cntw x23\n"
"ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "addvl x14, x14, #4\n"
- "add x23, x23, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x15, x15, #4\n"
+ "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "addvl x14, x14, #4\n"
+ "addvl x15, x15, #4\n"
"ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cmp x22, %x[n_channels]\n"
- "add x21, x23, x21, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x14]\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x22\n"
- "ld1w { z9.s }, p2/Z, [x17, x7, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x8]\n"
- "addvl x14, x14, #1\n"
- "ld1w { z11.s }, p2/Z, [x8, x12, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x17, x15, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x16, x7, LSL #2]\n"
+ "cmp x23, %x[n_channels]\n"
+ "add x22, x24, x22, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "ld1w { z9.s }, p2/Z, [x8, x6, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "addvl x15, x15, #1\n"
+ "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x8, x16, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
"movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "whilelt p1.s, x22, %x[n_channels]\n"
- "incw x20\n"
+ "whilelt p1.s, x23, %x[n_channels]\n"
+ "incw x21\n"
"movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "incw x22\n"
+ "ld1w { z9.s }, p2/Z, [x14]\n"
+ "incw x23\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x12, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
- "incw x19\n"
+ "ld1w { z10.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "incw x20\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x7, x6, LSL #2]\n"
"fmla z30.s, p3/M, z6.s, z9.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x8, x15, LSL #2]\n"
- "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "addvl x7, x7, #1\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ld1w { z18.s }, p3/Z, [x14]\n"
- "addvl x14, x14, #1\n"
+ "ld1w { z18.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x17]\n"
+ "ld1w { z11.s }, p2/Z, [x8]\n"
"fmla z28.s, p3/M, z1.s, z12.s\n"
"fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x17, x12, LSL #2]\n"
- "addvl x17, x17, #1\n"
+ "ld1w { z12.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "addvl x8, x8, #1\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x16]\n"
+ "ld1w { z9.s }, p2/Z, [x17]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
"fmla z28.s, p3/M, z8.s, z10.s\n"
"fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x12, LSL #2]\n"
- "addvl x16, x16, #1\n"
+ "ld1w { z10.s }, p2/Z, [x17, x13, LSL #2]\n"
+ "addvl x17, x17, #1\n"
"fmla z30.s, p3/M, z3.s, z9.s\n"
"fmla z31.s, p3/M, z5.s, z10.s\n"
- "ld1w { z13.s }, p1/Z, [x16, x7, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x6, LSL #2]\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
"fmla z30.s, p3/M, z7.s, z11.s\n"
"fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x15, LSL #2]\n"
- "whilelt p2.s, x20, %x[n_channels]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"fmla z28.s, p3/M, z6.s, z9.s\n"
"fmla z29.s, p3/M, z8.s, z10.s\n"
- ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
"fmla z30.s, p3/M, z8.s, z12.s\n"
"fmla z31.s, p3/M, z7.s, z12.s\n"
- ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
- "cmp x22, %x[n_channels]\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x23, %x[n_channels]\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "addvl x13, x13, #1\n"
- "ld1w { z9.s }, p1/Z, [x17, x7, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x8]\n"
- "st1w { z28.s }, p0, [x23]\n"
- "ld1w { z11.s }, p1/Z, [x8, x12, LSL #2]\n"
- "st1w { z29.s }, p0, [x23, x24, LSL #2]\n"
- "addvl x23, x23, #1\n"
- "ld1w { z12.s }, p1/Z, [x17, x15, LSL #2]\n"
- "st1w { z30.s }, p0, [x21]\n"
- "st1w { z31.s }, p0, [x21, x24, LSL #2]\n"
- "addvl x21, x21, #1\n"
- "ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
+ "ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x7]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "ld1w { z11.s }, p1/Z, [x7, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "ld1w { z12.s }, p1/Z, [x8, x16, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
"movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x6, x6, #0x1\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x5, x5, #0x1\n"
"movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ld1w { z9.s }, p2/Z, [x14]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x12, LSL #2]\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "ld1w { z10.s }, p2/Z, [x17, x16, LSL #2]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
- "cmp x6, x19\n"
+ "ld1w { z12.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "cmp x5, x20\n"
"fmla z30.s, p3/M, z6.s, z9.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x8, x15, LSL #2]\n"
- "add x19, x5, #0x1\n"
+ "ld1w { z9.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "add x20, x4, #0x1\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "csel x5, x5, x19, LT\n"
+ "csel x4, x4, x20, LT\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x17]\n"
- "csel x6, x6, XZR, LT\n"
+ "ld1w { z11.s }, p2/Z, [x8]\n"
+ "csel x5, x5, XZR, LT\n"
"fmla z28.s, p3/M, z1.s, z12.s\n"
"fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x17, x12, LSL #2]\n"
- "cmp x5, x20\n"
+ "ld1w { z12.s }, p2/Z, [x8, x13, LSL #2]\n"
+ "cmp x4, x21\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x16]\n"
+ "ld1w { z9.s }, p2/Z, [x17]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
"fmla z28.s, p3/M, z8.s, z10.s\n"
"fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x12, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x17, x13, LSL #2]\n"
"fmla z30.s, p3/M, z3.s, z9.s\n"
"fmla z31.s, p3/M, z5.s, z10.s\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x6, LSL #2]\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
"fmla z30.s, p3/M, z7.s, z11.s\n"
"fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x16, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z9.s\n"
"fmla z29.s, p3/M, z8.s, z10.s\n"
"fmla z30.s, p3/M, z8.s, z12.s\n"
"fmla z31.s, p3/M, z7.s, z12.s\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x21]\n"
- "st1w { z31.s }, p0, [x21, x24, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 5fc6602c91..079b39c5ec 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,196 +78,196 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z18.s }, p3/Z, [x13]\n"
- "addvl x13, x13, #1\n"
- "ldp x12, x11, [x19, #0x0]\n"
- "cntw x10\n"
- ".inst 0xa040c1a0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x13]\n"
- "addvl x13, x13, #4\n"
- "ldp x9, x28, [x19, #0x10]\n"
- "mov x27, #0x0\n"
+ "ld1w { z18.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "cntw x11\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "mov x28, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c1a4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x13]\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "addvl x13, x13, #4\n"
- "cmp x10, %x[n_channels]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x11, %x[n_channels]\n"
"ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x24, x21, [x14, #0x10]\n"
+ "ldp x25, x22, [x15, #0x10]\n"
"ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x23, XZR, x10\n"
- "ldr x22, [x14, #0x20]\n"
- "ld1w { z8.s }, p3/Z, [x13]\n"
- "addvl x13, x13, #1\n"
- "ld1w { z9.s }, p2/Z, [x26, x27, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "sub x24, XZR, x11\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x28, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
"movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x14, #0x28]\n"
- "whilelt p1.s, x10, %x[n_channels]\n"
+ "ldr x22, [x15, #0x28]\n"
+ "whilelt p1.s, x11, %x[n_channels]\n"
"movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ldr x20, [x14, #0x30]\n"
+ "ld1w { z9.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x30]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x19, [x14, #0x38]\n"
- "ld1w { z11.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x14, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x27, LSL #2]\n"
- "ldr x26, [x14, #0x40]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x27, [x15, #0x40]\n"
"fmla z30.s, p3/M, z6.s, z9.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x27, LSL #2]\n"
- "ldr x24, [x14, #0x50]\n"
+ "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x50]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x21, [x14, #0x58]\n"
- "ld1w { z18.s }, p3/Z, [x13]\n"
+ "ldr x22, [x15, #0x58]\n"
+ "ld1w { z18.s }, p3/Z, [x14]\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ldr x22, [x14, #0x60]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ldr x23, [x15, #0x60]\n"
"fmla z28.s, p3/M, z1.s, z12.s\n"
"fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ldr x21, [x14, #0x68]\n"
+ "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x14, #0x70]\n"
- "addvl x13, x13, #1\n"
+ "ldr x21, [x15, #0x70]\n"
+ "addvl x14, x14, #1\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ldr x19, [x14, #0x78]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "incw x23\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "incw x24\n"
"fmla z28.s, p3/M, z8.s, z10.s\n"
"fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ldp x24, x21, [x14, #0x10]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldp x25, x22, [x15, #0x10]\n"
"fmla z30.s, p3/M, z3.s, z9.s\n"
"fmla z31.s, p3/M, z5.s, z10.s\n"
- "ldr x22, [x14, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x11, LSL #2]\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z7.s, z11.s\n"
"fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x27, LSL #2]\n"
- "incw x27\n"
+ "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "incw x28\n"
"fmla z28.s, p3/M, z6.s, z9.s\n"
"fmla z29.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x26, x10, LSL #2]\n"
- "whilelt p2.s, x27, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x11, LSL #2]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
"fmla z30.s, p3/M, z8.s, z12.s\n"
"fmla z31.s, p3/M, z7.s, z12.s\n"
- "ld1w { z10.s }, p1/Z, [x25, x10, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x25, x11, LSL #2]\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x12, x23, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x21, x10, LSL #2]\n"
- "incw x10\n"
- "cmp x10, %x[n_channels]\n"
- "st1w { z29.s }, p0, [x11, x23, LSL #2]\n"
- ".inst 0xa040c1a0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x13]\n"
- "addvl x13, x13, #4\n"
- "st1w { z30.s }, p0, [x9, x23, LSL #2]\n"
- ".inst 0xa040c1a4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x13]\n"
- "addvl x13, x13, #4\n"
- "st1w { z31.s }, p0, [x28, x23, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x13]\n"
- "addvl x13, x13, #1\n"
+ "st1w { z28.s }, p0, [x13, x24, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x22, x11, LSL #2]\n"
+ "incw x11\n"
+ "cmp x11, %x[n_channels]\n"
+ "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z30.s }, p0, [x10, x24, LSL #2]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z31.s }, p0, [x9, x24, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
"blt 1b\n"
"2:" // Channel tail
"movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x14, #0x28]\n"
- "incw x23\n"
+ "ldr x22, [x15, #0x28]\n"
+ "incw x24\n"
"movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ldr x20, [x14, #0x30]\n"
+ "ld1w { z9.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x30]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x19, [x14, #0x38]\n"
- "ld1w { z11.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x25, [x14, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x27, LSL #2]\n"
- "ldr x26, [x14, #0x40]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x27, [x15, #0x40]\n"
"fmla z30.s, p3/M, z6.s, z9.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x27, LSL #2]\n"
- "ldr x24, [x14, #0x50]\n"
+ "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x50]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x21, [x14, #0x58]\n"
+ "ldr x22, [x15, #0x58]\n"
"mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "ldr x22, [x14, #0x60]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ldr x23, [x15, #0x60]\n"
"fmla z28.s, p3/M, z1.s, z12.s\n"
"fmla z29.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ldr x21, [x14, #0x68]\n"
+ "ld1w { z12.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x14, #0x70]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z29.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ldr x19, [x14, #0x78]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
"fmla z28.s, p3/M, z8.s, z10.s\n"
"fmla z29.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x28, LSL #2]\n"
"fmla z30.s, p3/M, z3.s, z9.s\n"
"fmla z31.s, p3/M, z5.s, z10.s\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
"fmla z30.s, p3/M, z7.s, z11.s\n"
"fmla z31.s, p3/M, z6.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x27, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x28, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z9.s\n"
"fmla z29.s, p3/M, z8.s, z10.s\n"
"fmla z30.s, p3/M, z8.s, z12.s\n"
"fmla z31.s, p3/M, z7.s, z12.s\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x12, x23, LSL #2]\n"
- "st1w { z29.s }, p0, [x11, x23, LSL #2]\n"
- "st1w { z30.s }, p0, [x9, x23, LSL #2]\n"
- "st1w { z31.s }, p0, [x28, x23, LSL #2]\n"
+ "st1w { z28.s }, p0, [x13, x24, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x24, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 8ff0fe4dff..ce0ae29756 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,105 +94,105 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mov x3, #0x0\n"
"1:" // Tile loop
"str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x21, #0x3\n"
+ "mov x22, #0x3\n"
"str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x19, x2, x20\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "madd x19, x3, x4, x19\n" // offset += tile_j * ld_input_col
- "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "add x5, x5, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x6, x5, x20, LSL #2\n"
- "add x7, x6, x20, LSL #2\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
"add x8, x4, x4\n"
"ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, x7, x20, LSL #2\n"
+ "add x16, x7, x21, LSL #2\n"
"add x15, x8, x4\n"
- "add x14, x16, x20, LSL #2\n"
+ "add x14, x16, x21, LSL #2\n"
"add x13, x15, x4\n"
"cbnz x3, 2f\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x20, x19, x3\n"
- "sub x20, x20, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
"lsl x12, %x[n_channels], #0x2\n"
- "mov x19, #0xc\n"
- "and x20, x20, #0x3fffff\n"
- "mul x19, x19, x4\n"
- "orr x12, x12, x20, LSL #22\n"
- "orr x12, x12, x19, LSL #38\n"
- "add x25, x7, x8, LSL #2\n"
- "add x24, x5, x13, LSL #2\n"
- "add x23, x6, x8, LSL #2\n"
- "add x22, x14, x13, LSL #2\n"
- "add x21, x7, x4, LSL #2\n"
- "add x20, x5, x4, LSL #2\n"
- "add x19, x5, x15, LSL #2\n"
- "add x11, x7, x15, LSL #2\n"
- "add x10, x6, x13, LSL #2\n"
- "add x9, x16, x8, LSL #2\n"
- "add x28, x16, x13, LSL #2\n"
- "add x27, x14, x4, LSL #2\n"
- "add x26, x6, x4, LSL #2\n"
- ".inst 0xf8ac4b3a // rprfm pldonce, x25, [x12]\n"
- "add x25, x6, x15, LSL #2\n"
- ".inst 0xf8ac48ba // rprfm pldonce, x5, [x12]\n"
- ".inst 0xf8ac4b1a // rprfm pldonce, x24, [x12]\n"
- "add x24, x14, x15, LSL #2\n"
- ".inst 0xf8ac49da // rprfm pldonce, x14, [x12]\n"
- ".inst 0xf8ac4afa // rprfm pldonce, x23, [x12]\n"
- "add x23, x16, x4, LSL #2\n"
- ".inst 0xf8ac4ada // rprfm pldonce, x22, [x12]\n"
- "add x22, x5, x8, LSL #2\n"
- ".inst 0xf8ac4aba // rprfm pldonce, x21, [x12]\n"
- "add x21, x16, x15, LSL #2\n"
- ".inst 0xf8ac4a9a // rprfm pldonce, x20, [x12]\n"
- "add x20, x7, x13, LSL #2\n"
- ".inst 0xf8ac4a7a // rprfm pldonce, x19, [x12]\n"
- "add x19, x14, x8, LSL #2\n"
- ".inst 0xf8ac497a // rprfm pldonce, x11, [x12]\n"
- ".inst 0xf8ac48da // rprfm pldonce, x6, [x12]\n"
- ".inst 0xf8ac495a // rprfm pldonce, x10, [x12]\n"
- ".inst 0xf8ac4a1a // rprfm pldonce, x16, [x12]\n"
- ".inst 0xf8ac493a // rprfm pldonce, x9, [x12]\n"
- ".inst 0xf8ac4b9a // rprfm pldonce, x28, [x12]\n"
- ".inst 0xf8ac4b7a // rprfm pldonce, x27, [x12]\n"
- ".inst 0xf8ac4b5a // rprfm pldonce, x26, [x12]\n"
- ".inst 0xf8ac4b3a // rprfm pldonce, x25, [x12]\n"
- ".inst 0xf8ac4b1a // rprfm pldonce, x24, [x12]\n"
- ".inst 0xf8ac4afa // rprfm pldonce, x23, [x12]\n"
- ".inst 0xf8ac4ada // rprfm pldonce, x22, [x12]\n"
- ".inst 0xf8ac4aba // rprfm pldonce, x21, [x12]\n"
- ".inst 0xf8ac48fa // rprfm pldonce, x7, [x12]\n"
- ".inst 0xf8ac4a9a // rprfm pldonce, x20, [x12]\n"
- ".inst 0xf8ac4a7a // rprfm pldonce, x19, [x12]\n"
+ "mov x20, #0xc\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x27, x7, x8, LSL #2\n"
+ "add x26, x5, x13, LSL #2\n"
+ "add x25, x6, x8, LSL #2\n"
+ "add x24, x14, x13, LSL #2\n"
+ "add x23, x7, x4, LSL #2\n"
+ "add x22, x5, x4, LSL #2\n"
+ "add x21, x5, x15, LSL #2\n"
+ "add x20, x7, x15, LSL #2\n"
+ "add x11, x6, x13, LSL #2\n"
+ "add x10, x16, x8, LSL #2\n"
+ "add x9, x16, x13, LSL #2\n"
+ "add x28, x14, x4, LSL #2\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x6, x4, LSL #2\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x6, x15, LSL #2\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x16, x4, LSL #2\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x16, x15, LSL #2\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x7, x13, LSL #2\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x14, x8, LSL #2\n"
+ ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x2, x21\n" // offset = tile_i * ld_output_row
- "mov x19, #0x3\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x3\n"
"ld1w { z18.s }, p3/Z, [x17]\n"
- "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x20, x3, x26, x20\n" // offset += tile_j * ld_output_col
- "mul x20, x20, x19\n" // offset *= output_tile_size
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
"ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
"addvl x17, x17, #1\n"
- "add x25, x25, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "cntw x24\n"
+ "cntw x25\n"
"addvl x17, x17, #4\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "add x23, x25, x21, LSL #2\n"
+ "add x24, x26, x22, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"addvl x17, x17, #4\n"
- "cmp x24, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
- "add x22, x23, x21, LSL #2\n"
- "add x21, x26, x26\n"
+ "add x23, x24, x22, LSL #2\n"
+ "add x22, x27, x27\n"
"ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x24\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x25\n"
"ld1w { z10.s }, p2/Z, [x5]\n"
"ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
"addvl x17, x17, #1\n"
@@ -202,15 +202,15 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"3:" // Tile loop: Channel loop
"movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
"movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x24, %x[n_channels]\n"
- "incw x20\n"
+ "whilelt p1.s, x25, %x[n_channels]\n"
+ "incw x21\n"
"movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "incw x24\n"
+ "incw x25\n"
"mov p0.b, p2.b\n"
"movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
"movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "incw x19\n"
+ "incw x20\n"
"movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
"fmla z23.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
@@ -309,7 +309,7 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmla z27.s, p3/M, z8.s, z13.s\n"
"ld1w { z13.s }, p2/Z, [x14, x8, LSL #2]\n"
"fmla z26.s, p3/M, z3.s, z12.s\n"
- "whilelt p2.s, x20, %x[n_channels]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
@@ -317,7 +317,7 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmla z29.s, p3/M, z8.s, z13.s\n"
"fmla z30.s, p3/M, z7.s, z13.s\n"
"addvl x14, x14, #1\n"
- "cmp x24, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
"fmla z31.s, p3/M, z6.s, z13.s\n"
"fmax z23.s, p3/M, z23.s, z17.s\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
@@ -327,21 +327,21 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
"ld1w { z12.s }, p1/Z, [x14]\n"
- "st1w { z23.s }, p0, [x25]\n"
+ "st1w { z23.s }, p0, [x26]\n"
"ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
- "st1w { z24.s }, p0, [x25, x26, LSL #2]\n"
- "st1w { z25.s }, p0, [x25, x21, LSL #2]\n"
- "addvl x25, x25, #1\n"
+ "st1w { z24.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1w { z26.s }, p0, [x23]\n"
- "st1w { z27.s }, p0, [x23, x26, LSL #2]\n"
- "st1w { z28.s }, p0, [x23, x21, LSL #2]\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "st1w { z27.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
- "st1w { z29.s }, p0, [x22]\n"
- "st1w { z30.s }, p0, [x22, x26, LSL #2]\n"
- "st1w { z31.s }, p0, [x22, x21, LSL #2]\n"
- "addvl x22, x22, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
"movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
@@ -351,26 +351,26 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x20, x2, #0x1\n"
+ "add x21, x2, #0x1\n"
"movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
"movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x3, x19\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
"movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
"fmla z23.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
"ld1w { z11.s }, p2/Z, [x7, x4, LSL #2]\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "csel x2, x2, x20, LT\n"
+ "csel x2, x2, x21, LT\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
"mov p0.b, p2.b\n"
"csel x3, x3, XZR, LT\n"
"fmla z23.s, p3/M, z5.s, z13.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "cmp x2, x19\n"
+ "cmp x2, x20\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
@@ -459,21 +459,21 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmax z23.s, p3/M, z23.s, z17.s\n"
"fmin z23.s, p3/M, z23.s, z16.s\n"
".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z23.s }, p0, [x25]\n"
+ "st1w { z23.s }, p0, [x26]\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p0, [x25, x26, LSL #2]\n"
- "st1w { z25.s }, p0, [x25, x21, LSL #2]\n"
- "st1w { z26.s }, p0, [x23]\n"
- "st1w { z27.s }, p0, [x23, x26, LSL #2]\n"
- "st1w { z28.s }, p0, [x23, x21, LSL #2]\n"
- "st1w { z29.s }, p0, [x22]\n"
- "st1w { z30.s }, p0, [x22, x26, LSL #2]\n"
- "st1w { z31.s }, p0, [x22, x21, LSL #2]\n"
+ "st1w { z24.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "st1w { z27.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index ab910c144d..fd648a392f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,354 +87,354 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z18.s }, p3/Z, [x16]\n"
- "addvl x16, x16, #1\n"
- "ldp x14, x13, [x15, #0x0]\n"
- "ldp x12, x11, [x15, #0x10]\n"
- "cntw x10\n"
- ".inst 0xa040c200 // ld1w { z0.s-z3.s }, pn8.b/Z, [x16]\n"
- "addvl x16, x16, #4\n"
- "ldr x9, [x15, #0x20]\n"
- "mov x28, #0x0\n"
+ "ld1w { z18.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ldp x15, x14, [x16, #0x0]\n"
+ "ldp x13, x12, [x16, #0x10]\n"
+ "cntw x11\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ldr x10, [x16, #0x20]\n"
+ "mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c204 // ld1w { z4.s-z7.s }, pn8.b/Z, [x16]\n"
- "addvl x16, x16, #4\n"
- "cmp x10, %x[n_channels]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x11, %x[n_channels]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x26, XZR, x10\n"
- "ld1w { z8.s }, p3/Z, [x16]\n"
- "addvl x16, x16, #1\n"
- "ld1w { z9.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x13, x28, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x12, x28, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x11, x28, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "sub x27, XZR, x11\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ld1w { z9.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
"movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
"movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x25, [x15, #0x30]\n"
- "incw x26\n"
+ "ldr x26, [x16, #0x30]\n"
+ "incw x27\n"
"movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
"fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x24, [x15, #0x38]\n"
+ "ldr x25, [x16, #0x38]\n"
"mov p1.b, p2.b\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
"movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x23, [x15, #0x28]\n"
- "whilelt p0.s, x10, %x[n_channels]\n"
+ "ldr x24, [x16, #0x28]\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
"movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
"movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x13, [x15, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "ldr x14, [x16, #0x48]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x14, [x15, #0x40]\n"
+ "ldr x15, [x16, #0x40]\n"
"fmla z23.s, p3/M, z5.s, z13.s\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x12, [x15, #0x50]\n"
+ "ldr x13, [x16, #0x50]\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x11, [x15, #0x58]\n"
+ "ldr x12, [x16, #0x58]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x9, [x15, #0x60]\n"
+ "ldr x10, [x16, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x25, [x15, #0x70]\n"
- "ld1w { z12.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
"movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
"fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x23, [x15, #0x68]\n"
+ "ldr x24, [x16, #0x68]\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
"fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ldr x24, [x15, #0x78]\n"
+ "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "ldr x25, [x16, #0x78]\n"
"fmla z26.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x14, [x15, #0x80]\n"
- "ld1w { z18.s }, p3/Z, [x16]\n"
+ "ldr x15, [x16, #0x80]\n"
+ "ld1w { z18.s }, p3/Z, [x17]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
"fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x13, [x15, #0x88]\n"
- "addvl x16, x16, #1\n"
+ "ldr x14, [x16, #0x88]\n"
+ "addvl x17, x17, #1\n"
"fmla z29.s, p3/M, z1.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x28, LSL #2]\n"
- "ldr x12, [x15, #0x90]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "ldr x13, [x16, #0x90]\n"
"fmla z24.s, p3/M, z2.s, z12.s\n"
"fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n"
- "ldr x11, [x15, #0x98]\n"
- "ld1w { z12.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "ldr x12, [x16, #0x98]\n"
+ "ld1w { z12.s }, p2/Z, [x10, x9, LSL #2]\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
"fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x9, [x15, #0xa0]\n"
+ "ldr x10, [x16, #0xa0]\n"
"fmla z26.s, p3/M, z0.s, z11.s\n"
"fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x22, [x27, #0x0]\n"
+ "ldr x23, [x28, #0x0]\n"
"fmla z24.s, p3/M, z8.s, z10.s\n"
"fmla z25.s, p3/M, z7.s, z10.s\n"
- "ldr x21, [x27, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
"fmla z31.s, p3/M, z1.s, z10.s\n"
"fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ldr x23, [x15, #0xa8]\n"
+ "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ldr x24, [x16, #0xa8]\n"
"fmla z26.s, p3/M, z6.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ldr x14, [x15, #0xc0]\n"
+ "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "ldr x15, [x16, #0xc0]\n"
"fmla z28.s, p3/M, z6.s, z10.s\n"
"fmla z30.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x27, #0x10]\n"
+ "ldr x21, [x28, #0x10]\n"
"fmla z23.s, p3/M, z3.s, z11.s\n"
"fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x25, [x15, #0xb0]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ldr x26, [x16, #0xb0]\n"
"fmla z29.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z3.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ldr x24, [x15, #0xb8]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ldr x25, [x16, #0xb8]\n"
"fmla z26.s, p3/M, z8.s, z10.s\n"
"fmla z28.s, p3/M, z8.s, z11.s\n"
- "ldr x19, [x27, #0x18]\n"
+ "ldr x20, [x28, #0x18]\n"
"fmla z30.s, p3/M, z6.s, z13.s\n"
"fmla z24.s, p3/M, z3.s, z12.s\n"
"fmla z27.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
"fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
"fmla z25.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z2.s, z11.s\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x28, LSL #2]\n"
- "ldr x9, [x15, #0x20]\n"
+ "ld1w { z11.s }, p2/Z, [x10, x9, LSL #2]\n"
+ "ldr x10, [x16, #0x20]\n"
"fmla z23.s, p3/M, z2.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
"fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
"fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x24, x9, LSL #2]\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
"fmla z31.s, p3/M, z4.s, z13.s\n"
"fmla z24.s, p3/M, z1.s, z11.s\n"
"fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmax z23.s, p3/M, z23.s, z17.s\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z30.s, p3/M, z5.s, z13.s\n"
"fmla z29.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
"fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ldp x14, x13, [x15, #0x0]\n"
+ "ld1w { z13.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "ldp x15, x14, [x16, #0x0]\n"
"fmla z26.s, p3/M, z3.s, z12.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
- "ldp x12, x11, [x15, #0x10]\n"
- "incw x28\n"
+ "ldp x13, x12, [x16, #0x10]\n"
+ "incw x9\n"
"fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x22, x26, LSL #2]\n"
- "ldr x22, [x27, #0x20]\n"
+ "st1w { z23.s }, p1, [x23, x27, LSL #2]\n"
+ "ldr x23, [x28, #0x20]\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
"fmla z29.s, p3/M, z8.s, z13.s\n"
"fmla z30.s, p3/M, z7.s, z13.s\n"
- "ld1w { z9.s }, p0/Z, [x14, x10, LSL #2]\n"
- "whilelt p2.s, x28, %x[n_channels]\n"
+ "ld1w { z9.s }, p0/Z, [x15, x11, LSL #2]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
"fmla z31.s, p3/M, z6.s, z13.s\n"
".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p1, [x21, x26, LSL #2]\n"
- "ldr x21, [x27, #0x28]\n"
- "st1w { z25.s }, p1, [x20, x26, LSL #2]\n"
- "ldr x20, [x27, #0x30]\n"
- "ld1w { z10.s }, p0/Z, [x13, x10, LSL #2]\n"
+ "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "ld1w { z10.s }, p0/Z, [x14, x11, LSL #2]\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z26.s }, p1, [x19, x26, LSL #2]\n"
- "ldr x19, [x27, #0x38]\n"
- "ld1w { z11.s }, p0/Z, [x12, x10, LSL #2]\n"
- "st1w { z27.s }, p1, [x22, x26, LSL #2]\n"
- "ldr x22, [x27, #0x40]\n"
- "ld1w { z12.s }, p0/Z, [x11, x10, LSL #2]\n"
- "ld1w { z13.s }, p0/Z, [x9, x10, LSL #2]\n"
- "incw x10\n"
- "cmp x10, %x[n_channels]\n"
- "st1w { z28.s }, p1, [x21, x26, LSL #2]\n"
- ".inst 0xa040c200 // ld1w { z0.s-z3.s }, pn8.b/Z, [x16]\n"
- "addvl x16, x16, #4\n"
- "st1w { z29.s }, p1, [x20, x26, LSL #2]\n"
- ".inst 0xa040c204 // ld1w { z4.s-z7.s }, pn8.b/Z, [x16]\n"
- "addvl x16, x16, #4\n"
- "st1w { z30.s }, p1, [x19, x26, LSL #2]\n"
- "st1w { z31.s }, p1, [x22, x26, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x16]\n"
- "addvl x16, x16, #1\n"
+ "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "ld1w { z11.s }, p0/Z, [x13, x11, LSL #2]\n"
+ "st1w { z27.s }, p1, [x23, x27, LSL #2]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "ld1w { z12.s }, p0/Z, [x12, x11, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x10, x11, LSL #2]\n"
+ "incw x11\n"
+ "cmp x11, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
+ "st1w { z31.s }, p1, [x23, x27, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
"blt 1b\n"
"2:" // Channel tail
"movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
"movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x25, [x15, #0x30]\n"
- "incw x26\n"
+ "ldr x26, [x16, #0x30]\n"
+ "incw x27\n"
"movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
"fmla z23.s, p3/M, z0.s, z10.s\n"
- "ldr x24, [x15, #0x38]\n"
+ "ldr x25, [x16, #0x38]\n"
"mov p1.b, p2.b\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
"movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x23, [x15, #0x28]\n"
+ "ldr x24, [x16, #0x28]\n"
"movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
"movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x13, [x15, #0x48]\n"
- "ld1w { z10.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "ldr x14, [x16, #0x48]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
"movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x14, [x15, #0x40]\n"
+ "ldr x15, [x16, #0x40]\n"
"fmla z23.s, p3/M, z5.s, z13.s\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
- "ldr x12, [x15, #0x50]\n"
+ "ldr x13, [x16, #0x50]\n"
"movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x11, [x15, #0x58]\n"
+ "ldr x12, [x16, #0x58]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x9, [x15, #0x60]\n"
+ "ldr x10, [x16, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x25, [x15, #0x70]\n"
- "ld1w { z12.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
"movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
"fmla z23.s, p3/M, z7.s, z11.s\n"
- "ldr x23, [x15, #0x68]\n"
+ "ldr x24, [x16, #0x68]\n"
"fmla z24.s, p3/M, z0.s, z13.s\n"
"fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ldr x24, [x15, #0x78]\n"
+ "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "ldr x25, [x16, #0x78]\n"
"fmla z26.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z3.s, z11.s\n"
- "ldr x14, [x15, #0x80]\n"
+ "ldr x15, [x16, #0x80]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
"fmla z28.s, p3/M, z4.s, z10.s\n"
- "ldr x13, [x15, #0x88]\n"
+ "ldr x14, [x16, #0x88]\n"
"fmla z29.s, p3/M, z1.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x28, LSL #2]\n"
- "ldr x12, [x15, #0x90]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "ldr x13, [x16, #0x90]\n"
"fmla z24.s, p3/M, z2.s, z12.s\n"
"fmla z25.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n"
- "ldr x11, [x15, #0x98]\n"
- "ld1w { z12.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "ldr x12, [x16, #0x98]\n"
+ "ld1w { z12.s }, p2/Z, [x10, x9, LSL #2]\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
"fmla z30.s, p3/M, z2.s, z10.s\n"
- "ldr x9, [x15, #0xa0]\n"
+ "ldr x10, [x16, #0xa0]\n"
"fmla z26.s, p3/M, z0.s, z11.s\n"
"fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x22, [x27, #0x0]\n"
+ "ldr x23, [x28, #0x0]\n"
"fmla z24.s, p3/M, z8.s, z10.s\n"
"fmla z25.s, p3/M, z7.s, z10.s\n"
- "ldr x21, [x27, #0x8]\n"
+ "ldr x22, [x28, #0x8]\n"
"fmla z31.s, p3/M, z1.s, z10.s\n"
"fmla z29.s, p3/M, z3.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ldr x23, [x15, #0xa8]\n"
+ "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ldr x24, [x16, #0xa8]\n"
"fmla z26.s, p3/M, z6.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ldr x14, [x15, #0xc0]\n"
+ "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "ldr x15, [x16, #0xc0]\n"
"fmla z28.s, p3/M, z6.s, z10.s\n"
"fmla z30.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x27, #0x10]\n"
+ "ldr x21, [x28, #0x10]\n"
"fmla z23.s, p3/M, z3.s, z11.s\n"
"fmla z25.s, p3/M, z5.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x25, [x15, #0xb0]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ldr x26, [x16, #0xb0]\n"
"fmla z29.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z3.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ldr x24, [x15, #0xb8]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ldr x25, [x16, #0xb8]\n"
"fmla z26.s, p3/M, z8.s, z10.s\n"
"fmla z28.s, p3/M, z8.s, z11.s\n"
- "ldr x19, [x27, #0x18]\n"
+ "ldr x20, [x28, #0x18]\n"
"fmla z30.s, p3/M, z6.s, z13.s\n"
"fmla z24.s, p3/M, z3.s, z12.s\n"
"fmla z27.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
"fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x12, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
"fmla z25.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z2.s, z11.s\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z30.s, p3/M, z8.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x10, x9, LSL #2]\n"
"fmla z23.s, p3/M, z2.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
"fmla z30.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
"fmla z31.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x24, x9, LSL #2]\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
"fmla z31.s, p3/M, z4.s, z13.s\n"
"fmla z24.s, p3/M, z1.s, z11.s\n"
"fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmax z23.s, p3/M, z23.s, z17.s\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z30.s, p3/M, z5.s, z13.s\n"
"fmla z29.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
"fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x15, x9, LSL #2]\n"
"fmla z26.s, p3/M, z3.s, z12.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z23.s }, p1, [x22, x26, LSL #2]\n"
- "ldr x22, [x27, #0x20]\n"
+ "st1w { z23.s }, p1, [x23, x27, LSL #2]\n"
+ "ldr x23, [x28, #0x20]\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
"fmla z29.s, p3/M, z8.s, z13.s\n"
"fmla z30.s, p3/M, z7.s, z13.s\n"
"fmla z31.s, p3/M, z6.s, z13.s\n"
".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
- "st1w { z24.s }, p1, [x21, x26, LSL #2]\n"
- "ldr x21, [x27, #0x28]\n"
- "st1w { z25.s }, p1, [x20, x26, LSL #2]\n"
- "ldr x20, [x27, #0x30]\n"
+ "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
+ "ldr x21, [x28, #0x30]\n"
".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n"
- "st1w { z26.s }, p1, [x19, x26, LSL #2]\n"
- "ldr x19, [x27, #0x38]\n"
- "st1w { z27.s }, p1, [x22, x26, LSL #2]\n"
- "ldr x22, [x27, #0x40]\n"
- "st1w { z28.s }, p1, [x21, x26, LSL #2]\n"
- "st1w { z29.s }, p1, [x20, x26, LSL #2]\n"
- "st1w { z30.s }, p1, [x19, x26, LSL #2]\n"
- "st1w { z31.s }, p1, [x22, x26, LSL #2]\n"
+ "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "st1w { z27.s }, p1, [x23, x27, LSL #2]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
+ "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
+ "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
+ "st1w { z31.s }, p1, [x23, x27, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 8ec7bcca7e..5380567d36 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,131 +94,131 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"mov x3, #0x0\n"
"1:" // Tile loop
"str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x21, #0x4\n"
+ "mov x22, #0x4\n"
"str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x19, x2, x20\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "madd x19, x3, x4, x19\n" // offset += tile_j * ld_input_col
- "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "add x5, x5, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x6, x5, x20, LSL #2\n"
- "add x7, x6, x20, LSL #2\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
"add x8, x4, x4\n"
"ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, x7, x20, LSL #2\n"
+ "add x16, x7, x21, LSL #2\n"
"add x15, x8, x4\n"
- "add x14, x16, x20, LSL #2\n"
+ "add x14, x16, x21, LSL #2\n"
"add x13, x15, x4\n"
- "add x12, x14, x20, LSL #2\n"
+ "add x12, x14, x21, LSL #2\n"
"add x11, x13, x4\n"
"cbnz x3, 2f\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x20, x19, x3\n"
- "sub x20, x20, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
"lsl x10, %x[n_channels], #0x2\n"
- "mov x19, #0x10\n"
- "and x20, x20, #0x3fffff\n"
- "mul x19, x19, x4\n"
- "orr x10, x10, x20, LSL #22\n"
- "orr x10, x10, x19, LSL #38\n"
- "add x26, x7, x8, LSL #2\n"
- "add x25, x5, x11, LSL #2\n"
- "add x24, x7, x15, LSL #2\n"
- "add x23, x12, x11, LSL #2\n"
- "add x22, x16, x8, LSL #2\n"
- "add x21, x5, x4, LSL #2\n"
- "add x20, x5, x13, LSL #2\n"
- "add x19, x16, x15, LSL #2\n"
- "add x9, x6, x11, LSL #2\n"
- "add x28, x6, x8, LSL #2\n"
- "add x27, x14, x11, LSL #2\n"
- ".inst 0xf8aa4b5a // rprfm pldonce, x26, [x10]\n"
- "add x26, x6, x15, LSL #2\n"
- ".inst 0xf8aa48ba // rprfm pldonce, x5, [x10]\n"
- ".inst 0xf8aa4b3a // rprfm pldonce, x25, [x10]\n"
- "add x25, x12, x4, LSL #2\n"
- ".inst 0xf8aa4b1a // rprfm pldonce, x24, [x10]\n"
- "add x24, x7, x4, LSL #2\n"
- ".inst 0xf8aa499a // rprfm pldonce, x12, [x10]\n"
- ".inst 0xf8aa4afa // rprfm pldonce, x23, [x10]\n"
- "add x23, x12, x13, LSL #2\n"
- ".inst 0xf8aa4ada // rprfm pldonce, x22, [x10]\n"
- "add x22, x7, x13, LSL #2\n"
- ".inst 0xf8aa4aba // rprfm pldonce, x21, [x10]\n"
- "add x21, x5, x8, LSL #2\n"
- ".inst 0xf8aa4a9a // rprfm pldonce, x20, [x10]\n"
- "add x20, x16, x4, LSL #2\n"
- ".inst 0xf8aa4a7a // rprfm pldonce, x19, [x10]\n"
- "add x19, x5, x15, LSL #2\n"
- ".inst 0xf8aa48da // rprfm pldonce, x6, [x10]\n"
- ".inst 0xf8aa493a // rprfm pldonce, x9, [x10]\n"
- "add x9, x16, x13, LSL #2\n"
- ".inst 0xf8aa49da // rprfm pldonce, x14, [x10]\n"
- ".inst 0xf8aa4b9a // rprfm pldonce, x28, [x10]\n"
- "add x28, x7, x11, LSL #2\n"
- ".inst 0xf8aa4b7a // rprfm pldonce, x27, [x10]\n"
- "add x27, x14, x8, LSL #2\n"
- ".inst 0xf8aa4b5a // rprfm pldonce, x26, [x10]\n"
- "add x26, x16, x11, LSL #2\n"
- ".inst 0xf8aa4b3a // rprfm pldonce, x25, [x10]\n"
- "add x25, x12, x8, LSL #2\n"
- ".inst 0xf8aa4b1a // rprfm pldonce, x24, [x10]\n"
- "add x24, x14, x15, LSL #2\n"
- ".inst 0xf8aa4afa // rprfm pldonce, x23, [x10]\n"
- "add x23, x12, x15, LSL #2\n"
- ".inst 0xf8aa4ada // rprfm pldonce, x22, [x10]\n"
- "add x22, x6, x4, LSL #2\n"
- ".inst 0xf8aa4aba // rprfm pldonce, x21, [x10]\n"
- "add x21, x6, x13, LSL #2\n"
- ".inst 0xf8aa4a9a // rprfm pldonce, x20, [x10]\n"
- "add x20, x14, x4, LSL #2\n"
- ".inst 0xf8aa4a7a // rprfm pldonce, x19, [x10]\n"
- "add x19, x14, x13, LSL #2\n"
- ".inst 0xf8aa48fa // rprfm pldonce, x7, [x10]\n"
- ".inst 0xf8aa493a // rprfm pldonce, x9, [x10]\n"
- ".inst 0xf8aa4b9a // rprfm pldonce, x28, [x10]\n"
- ".inst 0xf8aa4a1a // rprfm pldonce, x16, [x10]\n"
- ".inst 0xf8aa4b7a // rprfm pldonce, x27, [x10]\n"
- ".inst 0xf8aa4b5a // rprfm pldonce, x26, [x10]\n"
- ".inst 0xf8aa4b3a // rprfm pldonce, x25, [x10]\n"
- ".inst 0xf8aa4b1a // rprfm pldonce, x24, [x10]\n"
- ".inst 0xf8aa4afa // rprfm pldonce, x23, [x10]\n"
- ".inst 0xf8aa4ada // rprfm pldonce, x22, [x10]\n"
- ".inst 0xf8aa4aba // rprfm pldonce, x21, [x10]\n"
- ".inst 0xf8aa4a9a // rprfm pldonce, x20, [x10]\n"
- ".inst 0xf8aa4a7a // rprfm pldonce, x19, [x10]\n"
+ "mov x20, #0x10\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x10, x10, x21, LSL #22\n"
+ "orr x10, x10, x20, LSL #38\n"
+ "add x9, x7, x8, LSL #2\n"
+ "add x28, x5, x11, LSL #2\n"
+ "add x27, x7, x15, LSL #2\n"
+ "add x26, x12, x11, LSL #2\n"
+ "add x25, x16, x8, LSL #2\n"
+ "add x24, x5, x4, LSL #2\n"
+ "add x23, x5, x13, LSL #2\n"
+ "add x22, x16, x15, LSL #2\n"
+ "add x21, x6, x11, LSL #2\n"
+ "add x20, x6, x8, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x14, x11, LSL #2\n"
+ ".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x6, x15, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x12, x4, LSL #2\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x7, x4, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x12, x13, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x7, x13, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x16, x4, LSL #2\n"
+ ".inst 0xf8aa48da // rprfm pldonce, x10, [x6]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x5, x15, LSL #2\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x16, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x7, x11, LSL #2\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x14, x8, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x16, x11, LSL #2\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x12, x8, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x12, x15, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x6, x4, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x6, x13, LSL #2\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x14, x4, LSL #2\n"
+ ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x14, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x2, x21\n" // offset = tile_i * ld_output_row
- "mov x19, #0x4\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x4\n"
"ld1w { z15.s }, p3/Z, [x17]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x20, x3, x28, x20\n" // offset += tile_j * ld_output_col
- "mul x20, x20, x19\n" // offset *= output_tile_size
+ "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
"ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x27, x27, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"addvl x17, x17, #1\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "add x26, x27, x21, LSL #2\n"
- "cntw x25\n"
+ "add x27, x28, x22, LSL #2\n"
+ "cntw x26\n"
"ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"addvl x17, x17, #4\n"
- "add x24, x26, x21, LSL #2\n"
+ "add x25, x27, x22, LSL #2\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "add x23, x28, x28\n"
+ "add x24, x9, x9\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
"addvl x17, x17, #4\n"
- "cmp x25, %x[n_channels]\n"
+ "cmp x26, %x[n_channels]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
- "add x22, x24, x21, LSL #2\n"
- "add x21, x23, x28\n"
+ "add x23, x25, x22, LSL #2\n"
+ "add x22, x24, x9\n"
"ld1w { z10.s }, p2/Z, [x5]\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x25\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x26\n"
"ld1w { z11.s }, p2/Z, [x5, x11, LSL #2]\n"
"ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
"addvl x17, x17, #1\n"
@@ -226,15 +226,15 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"3:" // Tile loop: Channel loop
"movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
"movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x25, %x[n_channels]\n"
- "incw x20\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "incw x21\n"
"movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
"movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "incw x25\n"
+ "incw x26\n"
"mov p0.b, p2.b\n"
"movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
"fmla z21.s, p3/M, z5.s, z12.s\n"
- "incw x19\n"
+ "incw x20\n"
"movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
"movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
"movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
@@ -261,15 +261,10 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ld1w { z15.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
"fmla z20.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z21.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x6]\n"
"fmla z16.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x6, x11, LSL #2]\n"
@@ -281,25 +276,28 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z25.s, p3/M, z5.s, z10.s\n"
"fmla z26.s, p3/M, z4.s, z10.s\n"
"fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
"fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
"ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
"ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
"fmla z27.s, p3/M, z8.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z11.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x6]\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
"ld1w { z11.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
"fmla z20.s, p3/M, z2.s, z10.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
"fmla z16.s, p3/M, z5.s, z10.s\n"
@@ -361,7 +359,6 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z29.s, p3/M, z4.s, z11.s\n"
"fmla z30.s, p3/M, z3.s, z11.s\n"
"fmla z19.s, p3/M, z8.s, z12.s\n"
- "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
"fmla z27.s, p3/M, z2.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
@@ -401,23 +398,25 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z20.s, p3/M, z1.s, z10.s\n"
"fmla z21.s, p3/M, z0.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
- "whilelt p2.s, x20, %x[n_channels]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
"fmla z19.s, p3/M, z4.s, z11.s\n"
- "cmp x25, %x[n_channels]\n"
- "addvl x14, x14, #1\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
"fmla z22.s, p3/M, z2.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
+ "cmp x26, %x[n_channels]\n"
+ "addvl x14, x14, #1\n"
"fmla z24.s, p3/M, z7.s, z12.s\n"
"fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
"fmla z28.s, p3/M, z4.s, z12.s\n"
"fmla z29.s, p3/M, z3.s, z12.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
"fmla z26.s, p3/M, z8.s, z10.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
@@ -427,28 +426,29 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ld1w { z10.s }, p1/Z, [x5]\n"
".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p0, [x27]\n"
- "st1w { z17.s }, p0, [x27, x28, LSL #2]\n"
+ "st1w { z16.s }, p0, [x28]\n"
+ "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "st1w { z17.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x28, x24, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1w { z18.s }, p0, [x27, x23, LSL #2]\n"
- "st1w { z19.s }, p0, [x27, x21, LSL #2]\n"
+ "st1w { z19.s }, p0, [x28, x22, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z20.s }, p0, [x27]\n"
+ "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
"addvl x27, x27, #1\n"
- "st1w { z20.s }, p0, [x26]\n"
- "st1w { z21.s }, p0, [x26, x28, LSL #2]\n"
- "st1w { z22.s }, p0, [x26, x23, LSL #2]\n"
- "st1w { z23.s }, p0, [x26, x21, LSL #2]\n"
- "addvl x26, x26, #1\n"
- "st1w { z24.s }, p0, [x24]\n"
- "st1w { z25.s }, p0, [x24, x28, LSL #2]\n"
- "st1w { z26.s }, p0, [x24, x23, LSL #2]\n"
- "st1w { z27.s }, p0, [x24, x21, LSL #2]\n"
- "addvl x24, x24, #1\n"
- "st1w { z28.s }, p0, [x22]\n"
- "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
- "st1w { z30.s }, p0, [x22, x23, LSL #2]\n"
- "st1w { z31.s }, p0, [x22, x21, LSL #2]\n"
- "addvl x22, x22, #1\n"
+ "st1w { z24.s }, p0, [x25]\n"
+ "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x25, x22, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
"movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
@@ -458,15 +458,15 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
"movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x20, x2, #0x1\n"
+ "add x21, x2, #0x1\n"
"movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
"fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x3, x19\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
"movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
"movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x2, x2, x20, LT\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x2, x2, x21, LT\n"
"movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
"movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
"ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
@@ -478,7 +478,7 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z22.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
"ld1w { z11.s }, p2/Z, [x12, x11, LSL #2]\n"
- "cmp x2, x19\n"
+ "cmp x2, x20\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
"ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
@@ -494,13 +494,10 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
"fmla z20.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z21.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x6]\n"
"fmla z16.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z0.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x6, x11, LSL #2]\n"
@@ -512,25 +509,28 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z25.s, p3/M, z5.s, z10.s\n"
"fmla z26.s, p3/M, z4.s, z10.s\n"
"fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
"fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z0.s, z9.s\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
"ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
- "fmla z16.s, p3/M, z3.s, z9.s\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z2.s, z12.s\n"
- "fmla z17.s, p3/M, z4.s, z10.s\n"
"ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
"fmla z27.s, p3/M, z8.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z11.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x6]\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
"ld1w { z11.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
"fmla z20.s, p3/M, z2.s, z10.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
"fmla z16.s, p3/M, z5.s, z10.s\n"
@@ -640,29 +640,29 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z31.s, p3/M, z4.s, z10.s\n"
".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p0, [x27]\n"
+ "st1w { z16.s }, p0, [x28]\n"
".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z17.s }, p0, [x27, x28, LSL #2]\n"
- "st1w { z18.s }, p0, [x27, x23, LSL #2]\n"
- "st1w { z19.s }, p0, [x27, x21, LSL #2]\n"
- "st1w { z20.s }, p0, [x26]\n"
- "st1w { z21.s }, p0, [x26, x28, LSL #2]\n"
- "st1w { z22.s }, p0, [x26, x23, LSL #2]\n"
- "st1w { z23.s }, p0, [x26, x21, LSL #2]\n"
- "st1w { z24.s }, p0, [x24]\n"
- "st1w { z25.s }, p0, [x24, x28, LSL #2]\n"
- "st1w { z26.s }, p0, [x24, x23, LSL #2]\n"
- "st1w { z27.s }, p0, [x24, x21, LSL #2]\n"
- "st1w { z28.s }, p0, [x22]\n"
- "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
- "st1w { z30.s }, p0, [x22, x23, LSL #2]\n"
- "st1w { z31.s }, p0, [x22, x21, LSL #2]\n"
+ "st1w { z17.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z20.s }, p0, [x27]\n"
+ "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
+ "st1w { z24.s }, p0, [x25]\n"
+ "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index d99ebb2bb4..d904f68806 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,211 +98,209 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z15.s }, p3/Z, [x16]\n"
- "addvl x16, x16, #1\n"
- "ldp x14, x13, [x15, #0x0]\n"
- "ldp x12, x11, [x15, #0x10]\n"
- "cntw x10\n"
- ".inst 0xa040c200 // ld1w { z0.s-z3.s }, pn8.b/Z, [x16]\n"
- "addvl x16, x16, #4\n"
- "mov x9, #0x0\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ldp x15, x14, [x16, #0x0]\n"
+ "ldp x13, x12, [x16, #0x10]\n"
+ "cntw x11\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "mov x10, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c204 // ld1w { z4.s-z7.s }, pn8.b/Z, [x16]\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "addvl x16, x16, #4\n"
- "cmp x10, %x[n_channels]\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x11, %x[n_channels]\n"
"ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x27, XZR, x10\n"
- "ld1w { z8.s }, p3/Z, [x16]\n"
- "addvl x16, x16, #1\n"
- "ld1w { z9.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "sub x28, XZR, x11\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ld1w { z9.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
"movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
"movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x26, [x15, #0x20]\n"
- "incw x27\n"
+ "ldr x27, [x16, #0x20]\n"
+ "incw x28\n"
"movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
"movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x25, [x15, #0x30]\n"
+ "ldr x26, [x16, #0x30]\n"
"mov p1.b, p2.b\n"
"movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "ldr x24, [x15, #0x28]\n"
+ "ldr x25, [x16, #0x28]\n"
"movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "whilelt p0.s, x10, %x[n_channels]\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
"movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
"fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x15, #0x38]\n"
+ "ldr x24, [x16, #0x38]\n"
"movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
"movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x14, [x15, #0x40]\n"
+ "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ldr x15, [x16, #0x40]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
"movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x13, [x15, #0x48]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ldr x14, [x16, #0x48]\n"
"fmla z22.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x12, [x15, #0x50]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "ldr x13, [x16, #0x50]\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z8.s, z12.s\n"
- "ldr x26, [x15, #0x60]\n"
+ "ldr x27, [x16, #0x60]\n"
"fmla z18.s, p3/M, z7.s, z12.s\n"
"movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x24, [x15, #0x68]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldr x25, [x16, #0x68]\n"
"fmla z21.s, p3/M, z7.s, z9.s\n"
"fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x11, [x15, #0x58]\n"
+ "ldr x12, [x16, #0x58]\n"
"movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
"movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldr x25, [x15, #0x70]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
"movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
"fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ldr x23, [x15, #0x78]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ldr x24, [x16, #0x78]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [x15, #0x80]\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ldr x13, [x15, #0x88]\n"
- "ld1w { z15.s }, p3/Z, [x16]\n"
+ "ldr x15, [x16, #0x80]\n"
"fmla z20.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
- "ldr x22, [x28, #0x0]\n"
- "addvl x16, x16, #1\n"
+ "ldr x14, [x16, #0x88]\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z16.s, p3/M, z1.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x15, #0x90]\n"
+ "ldr x23, [x9, #0x0]\n"
"fmla z17.s, p3/M, z0.s, z12.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ldr x22, [x9, #0x8]\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
"fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
- "ldr x11, [x15, #0x98]\n"
+ "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ldr x13, [x16, #0x90]\n"
"fmla z21.s, p3/M, z8.s, z10.s\n"
"fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x15, #0xa0]\n"
+ "ld1w { z11.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ldr x27, [x16, #0xa0]\n"
"fmla z22.s, p3/M, z7.s, z10.s\n"
"fmla z23.s, p3/M, z6.s, z10.s\n"
- "ldr x21, [x28, #0x8]\n"
+ "ldr x21, [x9, #0x10]\n"
"fmla z25.s, p3/M, z5.s, z10.s\n"
"fmla z26.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x28, #0x10]\n"
+ "ldr x20, [x9, #0x18]\n"
"fmla z27.s, p3/M, z3.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z10.s\n"
- "ldr x19, [x28, #0x18]\n"
"fmla z30.s, p3/M, z1.s, z10.s\n"
"fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x24, [x15, #0xa8]\n"
+ "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "ldr x25, [x16, #0xa8]\n"
"fmla z16.s, p3/M, z3.s, z9.s\n"
"fmla z20.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ldr x12, [x16, #0x98]\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x15, #0xb0]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ldr x26, [x16, #0xb0]\n"
"fmla z17.s, p3/M, z4.s, z10.s\n"
"fmla z18.s, p3/M, z3.s, z10.s\n"
"fmla z21.s, p3/M, z1.s, z10.s\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z2.s, z12.s\n"
"fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldr x23, [x15, #0xb8]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "ldr x24, [x16, #0xb8]\n"
"fmla z27.s, p3/M, z8.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ldr x14, [x15, #0xc0]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ldr x15, [x16, #0xc0]\n"
"fmla z16.s, p3/M, z5.s, z10.s\n"
"fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x15, #0xc8]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldr x14, [x16, #0xc8]\n"
"fmla z17.s, p3/M, z5.s, z12.s\n"
"fmla z18.s, p3/M, z4.s, z12.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
"fmla z19.s, p3/M, z3.s, z12.s\n"
"fmla z22.s, p3/M, z1.s, z12.s\n"
"fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
- "ldr x11, [x15, #0xd8]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ldr x12, [x16, #0xd8]\n"
"fmla z28.s, p3/M, z7.s, z11.s\n"
"fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x15, #0xd0]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ldr x13, [x16, #0xd0]\n"
"fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
"fmla z20.s, p3/M, z4.s, z10.s\n"
"fmla z21.s, p3/M, z3.s, z10.s\n"
"fmla z24.s, p3/M, z1.s, z10.s\n"
"fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x15, #0xe0]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ldr x27, [x16, #0xe0]\n"
"fmla z18.s, p3/M, z8.s, z12.s\n"
"fmla z30.s, p3/M, z8.s, z11.s\n"
"fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
"fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x24, [x15, #0xe8]\n"
+ "ldr x25, [x16, #0xe8]\n"
"fmla z19.s, p3/M, z7.s, z12.s\n"
"fmla z22.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
"fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x15, #0xf0]\n"
+ "ld1w { z12.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ldr x26, [x16, #0xf0]\n"
"fmla z16.s, p3/M, z2.s, z10.s\n"
"fmla z17.s, p3/M, z1.s, z10.s\n"
"fmla z18.s, p3/M, z0.s, z10.s\n"
"fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldr x23, [x15, #0xf8]\n"
+ "ld1w { z10.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "ldr x24, [x16, #0xf8]\n"
"fmla z21.s, p3/M, z6.s, z11.s\n"
"fmla z24.s, p3/M, z4.s, z11.s\n"
"fmla z25.s, p3/M, z3.s, z11.s\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
"fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x14, [x15, #0x100]\n"
+ "ldr x15, [x16, #0x100]\n"
"fmla z30.s, p3/M, z2.s, z11.s\n"
"fmla z17.s, p3/M, z2.s, z12.s\n"
"fmla z18.s, p3/M, z1.s, z12.s\n"
"fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x15, #0x108]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldr x14, [x16, #0x108]\n"
"fmla z16.s, p3/M, z6.s, z10.s\n"
"fmla z20.s, p3/M, z3.s, z10.s\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
"fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x15, #0x110]\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ldr x13, [x16, #0x110]\n"
"fmla z23.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z5.s, z11.s\n"
"fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
"fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x11, [x15, #0x118]\n"
+ "ldr x12, [x16, #0x118]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
"fmla z30.s, p3/M, z3.s, z11.s\n"
"fmla z19.s, p3/M, z8.s, z12.s\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
"fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
"fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
"fmla z25.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z6.s, z11.s\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
@@ -311,262 +309,264 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla z29.s, p3/M, z7.s, z10.s\n"
"fmla z30.s, p3/M, z6.s, z10.s\n"
"fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
"fmla z28.s, p3/M, z8.s, z10.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
"fmla z29.s, p3/M, z5.s, z11.s\n"
"fmla z30.s, p3/M, z4.s, z11.s\n"
"fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldp x14, x13, [x15, #0x0]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldp x15, x14, [x16, #0x0]\n"
"fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
"fmla z17.s, p3/M, z3.s, z10.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
- "ld1w { z9.s }, p0/Z, [x14, x10, LSL #2]\n"
"fmla z19.s, p3/M, z4.s, z11.s\n"
"fmla z29.s, p3/M, z8.s, z12.s\n"
"fmla z30.s, p3/M, z7.s, z12.s\n"
"fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x13, x10, LSL #2]\n"
"fmla z20.s, p3/M, z1.s, z10.s\n"
"fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n"
- "ldp x12, x11, [x15, #0x10]\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ldp x13, x12, [x16, #0x10]\n"
"fmla z22.s, p3/M, z2.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z11.s\n"
- "incw x9\n"
- "ld1w { z11.s }, p0/Z, [x12, x10, LSL #2]\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x20]\n"
+ "st1w { z16.s }, p1, [x23, x28, LSL #2]\n"
+ "ldr x23, [x9, #0x20]\n"
"fmla z24.s, p3/M, z7.s, z12.s\n"
- "st1w { z17.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x28]\n"
+ "st1w { z17.s }, p1, [x22, x28, LSL #2]\n"
+ "ldr x22, [x9, #0x28]\n"
"fmla z25.s, p3/M, z6.s, z12.s\n"
"fmla z26.s, p3/M, z8.s, z10.s\n"
- "st1w { z18.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x30]\n"
+ "st1w { z18.s }, p1, [x21, x28, LSL #2]\n"
+ "ldr x21, [x9, #0x30]\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z19.s }, p1, [x19, x27, LSL #2]\n"
- "ldr x19, [x28, #0x38]\n"
+ "st1w { z19.s }, p1, [x20, x28, LSL #2]\n"
+ "ldr x20, [x9, #0x38]\n"
"fmla z28.s, p3/M, z4.s, z12.s\n"
"fmla z29.s, p3/M, z3.s, z12.s\n"
- "st1w { z20.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x40]\n"
+ "st1w { z20.s }, p1, [x23, x28, LSL #2]\n"
+ "ldr x23, [x9, #0x40]\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
- "st1w { z21.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x48]\n"
+ "st1w { z21.s }, p1, [x22, x28, LSL #2]\n"
+ "ldr x22, [x9, #0x48]\n"
".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
- "ld1w { z10.s }, p0/Z, [x13, x10, LSL #2]\n"
- "st1w { z22.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x50]\n"
- "ld1w { z12.s }, p0/Z, [x11, x10, LSL #2]\n"
"incw x10\n"
- "st1w { z23.s }, p1, [x19, x27, LSL #2]\n"
- "ldr x19, [x28, #0x58]\n"
- ".inst 0xa040c200 // ld1w { z0.s-z3.s }, pn8.b/Z, [x16]\n"
- "addvl x16, x16, #4\n"
- "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x60]\n"
- "whilelt p2.s, x9, %x[n_channels]\n"
- ".inst 0xa040c204 // ld1w { z4.s-z7.s }, pn8.b/Z, [x16]\n"
- "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x68]\n"
- "addvl x16, x16, #4\n"
- "cmp x10, %x[n_channels]\n"
- "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x70]\n"
+ "st1w { z22.s }, p1, [x21, x28, LSL #2]\n"
+ "ldr x21, [x9, #0x50]\n"
+ "ld1w { z9.s }, p0/Z, [x15, x11, LSL #2]\n"
+ "whilelt p2.s, x10, %x[n_channels]\n"
+ "st1w { z23.s }, p1, [x20, x28, LSL #2]\n"
+ "ldr x20, [x9, #0x58]\n"
+ "ld1w { z10.s }, p0/Z, [x14, x11, LSL #2]\n"
".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "ld1w { z8.s }, p3/Z, [x16]\n"
- "st1w { z27.s }, p1, [x19, x27, LSL #2]\n"
- "ldr x19, [x28, #0x78]\n"
- "addvl x16, x16, #1\n"
- "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
- "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
- "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
- "st1w { z31.s }, p1, [x19, x27, LSL #2]\n"
+ "st1w { z24.s }, p1, [x23, x28, LSL #2]\n"
+ "ldr x23, [x9, #0x60]\n"
+ "ld1w { z11.s }, p0/Z, [x13, x11, LSL #2]\n"
+ "st1w { z25.s }, p1, [x22, x28, LSL #2]\n"
+ "ldr x22, [x9, #0x68]\n"
+ "ld1w { z12.s }, p0/Z, [x12, x11, LSL #2]\n"
+ "incw x11\n"
+ "st1w { z26.s }, p1, [x21, x28, LSL #2]\n"
+ "ldr x21, [x9, #0x70]\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1w { z27.s }, p1, [x20, x28, LSL #2]\n"
+ "ldr x20, [x9, #0x78]\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x11, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x23, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z29.s }, p1, [x22, x28, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x28, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
"movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
"movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
- "ldr x26, [x15, #0x20]\n"
- "incw x27\n"
+ "ldr x27, [x16, #0x20]\n"
+ "incw x28\n"
"movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
"movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "ldr x25, [x15, #0x30]\n"
+ "ldr x26, [x16, #0x30]\n"
"mov p1.b, p2.b\n"
"movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
- "ldr x24, [x15, #0x28]\n"
+ "ldr x25, [x16, #0x28]\n"
"movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
"movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
"fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x15, #0x38]\n"
+ "ldr x24, [x16, #0x38]\n"
"movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
"movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x14, [x15, #0x40]\n"
+ "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ldr x15, [x16, #0x40]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
"movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x13, [x15, #0x48]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ldr x14, [x16, #0x48]\n"
"fmla z22.s, p3/M, z4.s, z12.s\n"
"fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x12, [x15, #0x50]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "ldr x13, [x16, #0x50]\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z8.s, z12.s\n"
- "ldr x26, [x15, #0x60]\n"
+ "ldr x27, [x16, #0x60]\n"
"fmla z18.s, p3/M, z7.s, z12.s\n"
"movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x24, [x15, #0x68]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldr x25, [x16, #0x68]\n"
"fmla z21.s, p3/M, z7.s, z9.s\n"
"fmla z19.s, p3/M, z6.s, z12.s\n"
- "ldr x11, [x15, #0x58]\n"
+ "ldr x12, [x16, #0x58]\n"
"movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
"movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldr x25, [x15, #0x70]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
"movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
"fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ldr x23, [x15, #0x78]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ldr x24, [x16, #0x78]\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [x15, #0x80]\n"
- "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "ldr x13, [x15, #0x88]\n"
+ "ldr x15, [x16, #0x80]\n"
"fmla z20.s, p3/M, z8.s, z9.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
- "ldr x22, [x28, #0x0]\n"
+ "ldr x14, [x16, #0x88]\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
"fmla z16.s, p3/M, z1.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x15, #0x90]\n"
+ "ldr x23, [x9, #0x0]\n"
"fmla z17.s, p3/M, z0.s, z12.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ldr x22, [x9, #0x8]\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
"fmla z18.s, p3/M, z2.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
- "ldr x11, [x15, #0x98]\n"
+ "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ldr x13, [x16, #0x90]\n"
"fmla z21.s, p3/M, z8.s, z10.s\n"
"fmla z19.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x15, #0xa0]\n"
+ "ld1w { z11.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ldr x27, [x16, #0xa0]\n"
"fmla z22.s, p3/M, z7.s, z10.s\n"
"fmla z23.s, p3/M, z6.s, z10.s\n"
- "ldr x21, [x28, #0x8]\n"
+ "ldr x21, [x9, #0x10]\n"
"fmla z25.s, p3/M, z5.s, z10.s\n"
"fmla z26.s, p3/M, z4.s, z10.s\n"
- "ldr x20, [x28, #0x10]\n"
+ "ldr x20, [x9, #0x18]\n"
"fmla z27.s, p3/M, z3.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z10.s\n"
- "ldr x19, [x28, #0x18]\n"
"fmla z30.s, p3/M, z1.s, z10.s\n"
"fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ldr x24, [x15, #0xa8]\n"
+ "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "ldr x25, [x16, #0xa8]\n"
"fmla z16.s, p3/M, z3.s, z9.s\n"
"fmla z20.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ldr x12, [x16, #0x98]\n"
"fmla z24.s, p3/M, z6.s, z11.s\n"
"fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x15, #0xb0]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ldr x26, [x16, #0xb0]\n"
"fmla z17.s, p3/M, z4.s, z10.s\n"
"fmla z18.s, p3/M, z3.s, z10.s\n"
"fmla z21.s, p3/M, z1.s, z10.s\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z2.s, z12.s\n"
"fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldr x23, [x15, #0xb8]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "ldr x24, [x16, #0xb8]\n"
"fmla z27.s, p3/M, z8.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
- "ldr x14, [x15, #0xc0]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "ldr x15, [x16, #0xc0]\n"
"fmla z16.s, p3/M, z5.s, z10.s\n"
"fmla z20.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x15, #0xc8]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldr x14, [x16, #0xc8]\n"
"fmla z17.s, p3/M, z5.s, z12.s\n"
"fmla z18.s, p3/M, z4.s, z12.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
"fmla z19.s, p3/M, z3.s, z12.s\n"
"fmla z22.s, p3/M, z1.s, z12.s\n"
"fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
- "ldr x11, [x15, #0xd8]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "ldr x12, [x16, #0xd8]\n"
"fmla z28.s, p3/M, z7.s, z11.s\n"
"fmla z29.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x15, #0xd0]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ldr x13, [x16, #0xd0]\n"
"fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
"fmla z20.s, p3/M, z4.s, z10.s\n"
"fmla z21.s, p3/M, z3.s, z10.s\n"
"fmla z24.s, p3/M, z1.s, z10.s\n"
"fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ldr x26, [x15, #0xe0]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ldr x27, [x16, #0xe0]\n"
"fmla z18.s, p3/M, z8.s, z12.s\n"
"fmla z30.s, p3/M, z8.s, z11.s\n"
"fmla z31.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x10, LSL #2]\n"
"fmla z27.s, p3/M, z1.s, z12.s\n"
- "ldr x24, [x15, #0xe8]\n"
+ "ldr x25, [x16, #0xe8]\n"
"fmla z19.s, p3/M, z7.s, z12.s\n"
"fmla z22.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
"fmla z26.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ldr x25, [x15, #0xf0]\n"
+ "ld1w { z12.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ldr x26, [x16, #0xf0]\n"
"fmla z16.s, p3/M, z2.s, z10.s\n"
"fmla z17.s, p3/M, z1.s, z10.s\n"
"fmla z18.s, p3/M, z0.s, z10.s\n"
"fmla z20.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ldr x23, [x15, #0xf8]\n"
+ "ld1w { z10.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "ldr x24, [x16, #0xf8]\n"
"fmla z21.s, p3/M, z6.s, z11.s\n"
"fmla z24.s, p3/M, z4.s, z11.s\n"
"fmla z25.s, p3/M, z3.s, z11.s\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x10, LSL #2]\n"
"fmla z27.s, p3/M, z4.s, z11.s\n"
- "ldr x14, [x15, #0x100]\n"
+ "ldr x15, [x16, #0x100]\n"
"fmla z30.s, p3/M, z2.s, z11.s\n"
"fmla z17.s, p3/M, z2.s, z12.s\n"
"fmla z18.s, p3/M, z1.s, z12.s\n"
"fmla z19.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x9, LSL #2]\n"
- "ldr x13, [x15, #0x108]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ldr x14, [x16, #0x108]\n"
"fmla z16.s, p3/M, z6.s, z10.s\n"
"fmla z20.s, p3/M, z3.s, z10.s\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
"fmla z22.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x9, LSL #2]\n"
- "ldr x12, [x15, #0x110]\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "ldr x13, [x16, #0x110]\n"
"fmla z23.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z5.s, z11.s\n"
"fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
"fmla z27.s, p3/M, z2.s, z12.s\n"
- "ldr x11, [x15, #0x118]\n"
+ "ldr x12, [x16, #0x118]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
"fmla z30.s, p3/M, z3.s, z11.s\n"
"fmla z19.s, p3/M, z8.s, z12.s\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
"fmla z20.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
"fmla z24.s, p3/M, z3.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x25, x10, LSL #2]\n"
"fmla z25.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z6.s, z11.s\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
@@ -575,18 +575,18 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla z29.s, p3/M, z7.s, z10.s\n"
"fmla z30.s, p3/M, z6.s, z10.s\n"
"fmla z24.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x10, LSL #2]\n"
"fmla z28.s, p3/M, z8.s, z10.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
"fmla z29.s, p3/M, z5.s, z11.s\n"
"fmla z30.s, p3/M, z4.s, z11.s\n"
"fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
"fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
"fmla z17.s, p3/M, z3.s, z10.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
@@ -594,56 +594,56 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"fmla z29.s, p3/M, z8.s, z12.s\n"
"fmla z30.s, p3/M, z7.s, z12.s\n"
"fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x13, x10, LSL #2]\n"
"fmla z20.s, p3/M, z1.s, z10.s\n"
"fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
"fmla z22.s, p3/M, z2.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z11.s\n"
".inst 0xc1adc9d0 // fclamp { z16.s-z19.s }, z14.s, z13.s\n"
- "st1w { z16.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x20]\n"
+ "st1w { z16.s }, p1, [x23, x28, LSL #2]\n"
+ "ldr x23, [x9, #0x20]\n"
"fmla z24.s, p3/M, z7.s, z12.s\n"
- "st1w { z17.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x28]\n"
+ "st1w { z17.s }, p1, [x22, x28, LSL #2]\n"
+ "ldr x22, [x9, #0x28]\n"
"fmla z25.s, p3/M, z6.s, z12.s\n"
"fmla z26.s, p3/M, z8.s, z10.s\n"
- "st1w { z18.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x30]\n"
+ "st1w { z18.s }, p1, [x21, x28, LSL #2]\n"
+ "ldr x21, [x9, #0x30]\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
".inst 0xc1adc9d4 // fclamp { z20.s-z23.s }, z14.s, z13.s\n"
- "st1w { z19.s }, p1, [x19, x27, LSL #2]\n"
- "ldr x19, [x28, #0x38]\n"
+ "st1w { z19.s }, p1, [x20, x28, LSL #2]\n"
+ "ldr x20, [x9, #0x38]\n"
"fmla z28.s, p3/M, z4.s, z12.s\n"
"fmla z29.s, p3/M, z3.s, z12.s\n"
- "st1w { z20.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x40]\n"
+ "st1w { z20.s }, p1, [x23, x28, LSL #2]\n"
+ "ldr x23, [x9, #0x40]\n"
"fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
- "st1w { z21.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x48]\n"
+ "st1w { z21.s }, p1, [x22, x28, LSL #2]\n"
+ "ldr x22, [x9, #0x48]\n"
".inst 0xc1adc9d8 // fclamp { z24.s-z27.s }, z14.s, z13.s\n"
".inst 0xc1adc9dc // fclamp { z28.s-z31.s }, z14.s, z13.s\n"
- "st1w { z22.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x50]\n"
- "st1w { z23.s }, p1, [x19, x27, LSL #2]\n"
- "ldr x19, [x28, #0x58]\n"
- "st1w { z24.s }, p1, [x22, x27, LSL #2]\n"
- "ldr x22, [x28, #0x60]\n"
- "st1w { z25.s }, p1, [x21, x27, LSL #2]\n"
- "ldr x21, [x28, #0x68]\n"
- "st1w { z26.s }, p1, [x20, x27, LSL #2]\n"
- "ldr x20, [x28, #0x70]\n"
- "st1w { z27.s }, p1, [x19, x27, LSL #2]\n"
- "ldr x19, [x28, #0x78]\n"
- "st1w { z28.s }, p1, [x22, x27, LSL #2]\n"
- "st1w { z29.s }, p1, [x21, x27, LSL #2]\n"
- "st1w { z30.s }, p1, [x20, x27, LSL #2]\n"
- "st1w { z31.s }, p1, [x19, x27, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x28, LSL #2]\n"
+ "ldr x21, [x9, #0x50]\n"
+ "st1w { z23.s }, p1, [x20, x28, LSL #2]\n"
+ "ldr x20, [x9, #0x58]\n"
+ "st1w { z24.s }, p1, [x23, x28, LSL #2]\n"
+ "ldr x23, [x9, #0x60]\n"
+ "st1w { z25.s }, p1, [x22, x28, LSL #2]\n"
+ "ldr x22, [x9, #0x68]\n"
+ "st1w { z26.s }, p1, [x21, x28, LSL #2]\n"
+ "ldr x21, [x9, #0x70]\n"
+ "st1w { z27.s }, p1, [x20, x28, LSL #2]\n"
+ "ldr x20, [x9, #0x78]\n"
+ "st1w { z28.s }, p1, [x23, x28, LSL #2]\n"
+ "st1w { z29.s }, p1, [x22, x28, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x28, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x28, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 449df1e29a..f7f67855c1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,102 +94,102 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"mov x3, #0x0\n"
"1:" // Tile loop
"str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x21, #0x4\n"
+ "mov x22, #0x4\n"
"str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x19, x2, x20\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "madd x19, x3, x4, x19\n" // offset += tile_j * ld_input_col
- "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "add x5, x5, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x6, x5, x20, LSL #2\n"
- "add x7, x6, x20, LSL #2\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
"add x8, x4, x4\n"
"ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, x7, x20, LSL #2\n"
+ "add x16, x7, x21, LSL #2\n"
"add x15, x8, x4\n"
- "add x14, x16, x20, LSL #2\n"
+ "add x14, x16, x21, LSL #2\n"
"add x13, x15, x4\n"
"cbnz x3, 2f\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x20, x19, x3\n"
- "sub x20, x20, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
"lsl x12, %x[n_channels], #0x2\n"
- "mov x19, #0x10\n"
- "and x20, x20, #0x3fffff\n"
- "mul x19, x19, x4\n"
- "orr x12, x12, x20, LSL #22\n"
- "orr x12, x12, x19, LSL #38\n"
- "add x25, x7, x8, LSL #2\n"
- "add x24, x5, x4, LSL #2\n"
- "add x23, x5, x15, LSL #2\n"
- "add x22, x5, x13, LSL #2\n"
- "add x21, x6, x4, LSL #2\n"
- "add x20, x5, x8, LSL #2\n"
- "add x19, x6, x15, LSL #2\n"
- "add x11, x6, x13, LSL #2\n"
- "add x10, x6, x8, LSL #2\n"
- "add x9, x16, x4, LSL #2\n"
- "add x28, x7, x4, LSL #2\n"
- "add x27, x16, x15, LSL #2\n"
- "add x26, x7, x15, LSL #2\n"
- ".inst 0xf8ac4b3a // rprfm pldonce, x25, [x12]\n"
- "add x25, x16, x13, LSL #2\n"
- ".inst 0xf8ac48ba // rprfm pldonce, x5, [x12]\n"
- ".inst 0xf8ac4b1a // rprfm pldonce, x24, [x12]\n"
- "add x24, x7, x13, LSL #2\n"
- ".inst 0xf8ac4afa // rprfm pldonce, x23, [x12]\n"
- "add x23, x14, x4, LSL #2\n"
- ".inst 0xf8ac4ada // rprfm pldonce, x22, [x12]\n"
- "add x22, x16, x8, LSL #2\n"
- ".inst 0xf8ac48da // rprfm pldonce, x6, [x12]\n"
- ".inst 0xf8ac4aba // rprfm pldonce, x21, [x12]\n"
- "add x21, x14, x15, LSL #2\n"
- ".inst 0xf8ac4a9a // rprfm pldonce, x20, [x12]\n"
- "add x20, x14, x8, LSL #2\n"
- ".inst 0xf8ac4a7a // rprfm pldonce, x19, [x12]\n"
- "add x19, x14, x13, LSL #2\n"
- ".inst 0xf8ac497a // rprfm pldonce, x11, [x12]\n"
- ".inst 0xf8ac495a // rprfm pldonce, x10, [x12]\n"
- ".inst 0xf8ac4a1a // rprfm pldonce, x16, [x12]\n"
- ".inst 0xf8ac48fa // rprfm pldonce, x7, [x12]\n"
- ".inst 0xf8ac493a // rprfm pldonce, x9, [x12]\n"
- ".inst 0xf8ac4b9a // rprfm pldonce, x28, [x12]\n"
- ".inst 0xf8ac4b7a // rprfm pldonce, x27, [x12]\n"
- ".inst 0xf8ac4b5a // rprfm pldonce, x26, [x12]\n"
- ".inst 0xf8ac4b3a // rprfm pldonce, x25, [x12]\n"
- ".inst 0xf8ac49da // rprfm pldonce, x14, [x12]\n"
- ".inst 0xf8ac4b1a // rprfm pldonce, x24, [x12]\n"
- ".inst 0xf8ac4afa // rprfm pldonce, x23, [x12]\n"
- ".inst 0xf8ac4ada // rprfm pldonce, x22, [x12]\n"
- ".inst 0xf8ac4aba // rprfm pldonce, x21, [x12]\n"
- ".inst 0xf8ac4a9a // rprfm pldonce, x20, [x12]\n"
- ".inst 0xf8ac4a7a // rprfm pldonce, x19, [x12]\n"
+ "mov x20, #0x10\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x27, x7, x8, LSL #2\n"
+ "add x26, x5, x4, LSL #2\n"
+ "add x25, x5, x15, LSL #2\n"
+ "add x24, x5, x13, LSL #2\n"
+ "add x23, x6, x4, LSL #2\n"
+ "add x22, x5, x8, LSL #2\n"
+ "add x21, x6, x15, LSL #2\n"
+ "add x20, x6, x13, LSL #2\n"
+ "add x11, x6, x8, LSL #2\n"
+ "add x10, x16, x4, LSL #2\n"
+ "add x9, x7, x4, LSL #2\n"
+ "add x28, x16, x15, LSL #2\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x7, x15, LSL #2\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x16, x13, LSL #2\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x7, x13, LSL #2\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x14, x4, LSL #2\n"
+ ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x16, x8, LSL #2\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x14, x15, LSL #2\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x14, x8, LSL #2\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x14, x13, LSL #2\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x20, x2, x21\n" // offset = tile_i * ld_output_row
- "mov x19, #0x2\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x2\n"
"ld1w { z19.s }, p3/Z, [x17]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x20, x3, x24, x20\n" // offset += tile_j * ld_output_col
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
"addvl x17, x17, #1\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mul x20, x20, x19\n" // offset *= output_tile_size
- "cntw x22\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "cntw x23\n"
"ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"addvl x17, x17, #4\n"
- "add x23, x23, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
"addvl x17, x17, #4\n"
"ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cmp x22, %x[n_channels]\n"
- "add x21, x23, x21, LSL #2\n"
+ "cmp x23, %x[n_channels]\n"
+ "add x22, x24, x22, LSL #2\n"
"ld1w { z8.s }, p3/Z, [x17]\n"
- "mov x20, #0x0\n"
- "sub x19, XZR, x22\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
"ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
"ld1w { z10.s }, p2/Z, [x5]\n"
"addvl x17, x17, #1\n"
@@ -203,12 +203,12 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"3:" // Tile loop: Channel loop
"movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
"movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "whilelt p1.s, x22, %x[n_channels]\n"
- "incw x20\n"
+ "whilelt p1.s, x23, %x[n_channels]\n"
+ "incw x21\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x6, x13, LSL #2]\n"
- "incw x22\n"
+ "incw x23\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
"ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
@@ -224,7 +224,7 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmla z28.s, p3/M, z2.s, z16.s\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
"ld1w { z15.s }, p2/Z, [x7]\n"
- "incw x19\n"
+ "incw x20\n"
"movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
"movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
@@ -263,24 +263,24 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ld1w { z11.s }, p2/Z, [x14, x13, LSL #2]\n"
"fmla z30.s, p3/M, z8.s, z15.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
- "whilelt p2.s, x20, %x[n_channels]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"ld1w { z19.s }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "cmp x22, %x[n_channels]\n"
+ "cmp x23, %x[n_channels]\n"
".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
"addvl x14, x14, #1\n"
- "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z28.s }, p0, [x24]\n"
".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "st1w { z29.s }, p0, [x23, x24, LSL #2]\n"
- "addvl x23, x23, #1\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "addvl x24, x24, #1\n"
"ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "st1w { z30.s }, p0, [x21]\n"
+ "st1w { z30.s }, p0, [x22]\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
- "st1w { z31.s }, p0, [x21, x24, LSL #2]\n"
- "addvl x21, x21, #1\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "addvl x22, x22, #1\n"
"ld1w { z11.s }, p1/Z, [x5, x4, LSL #2]\n"
"ld1w { z12.s }, p1/Z, [x5, x15, LSL #2]\n"
"ld1w { z13.s }, p1/Z, [x5, x13, LSL #2]\n"
@@ -302,23 +302,23 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
"ld1w { z11.s }, p2/Z, [x6, x15, LSL #2]\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
"ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
"ld1w { z14.s }, p2/Z, [x16]\n"
- "cmp x3, x19\n"
+ "cmp x3, x20\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
"ld1w { z15.s }, p2/Z, [x7]\n"
- "add x19, x2, #0x1\n"
+ "add x20, x2, #0x1\n"
"movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
"movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
"ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
- "csel x2, x2, x19, LT\n"
+ "csel x2, x2, x20, LT\n"
"fmla z28.s, p3/M, z5.s, z13.s\n"
"fmla z29.s, p3/M, z3.s, z13.s\n"
"ld1w { z13.s }, p2/Z, [x16, x15, LSL #2]\n"
@@ -330,7 +330,7 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmla z30.s, p3/M, z0.s, z15.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
"ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
- "cmp x2, x20\n"
+ "cmp x2, x21\n"
"fmla z30.s, p3/M, z4.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z14.s\n"
"ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
@@ -356,15 +356,15 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmla z30.s, p3/M, z8.s, z15.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "st1w { z28.s }, p0, [x23]\n"
- "st1w { z29.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z30.s }, p0, [x21]\n"
- "st1w { z31.s }, p0, [x21, x24, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 063084eb3c..e2ff9a214e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,228 +87,228 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
"ptrue p3.b\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z19.s }, p3/Z, [x14]\n"
- "addvl x14, x14, #1\n"
- "ldp x13, x12, [x19, #0x0]\n"
- "cntw x11\n"
- ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
- "ldp x10, x9, [x19, #0x10]\n"
- "mov x28, #0x0\n"
+ "ld1w { z19.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x14, x13, [x20, #0x0]\n"
+ "cntw x12\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "addvl x14, x14, #4\n"
- "cmp x11, %x[n_channels]\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "ldp x28, x27, [x16, #0x0]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x12, %x[n_channels]\n"
"ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x26, x25, [x16, #0x10]\n"
"ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x23, XZR, x11\n"
- "ldp x22, x21, [x15, #0x20]\n"
- "ld1w { z8.s }, p3/Z, [x14]\n"
- "addvl x14, x14, #1\n"
- "ldp x20, x19, [x15, #0x30]\n"
- "ld1w { z9.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z15.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x19, x28, LSL #2]\n"
+ "sub x24, XZR, x12\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
"movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
"movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x27, [x15, #0x40]\n"
- "whilelt p1.s, x11, %x[n_channels]\n"
+ "ldr x28, [x16, #0x40]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x26, [x15, #0x48]\n"
- "ld1w { z12.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ldr x27, [x16, #0x48]\n"
+ "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
+ "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ldr x26, [x16, #0x50]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x24, [x15, #0x58]\n"
+ "ld1w { z13.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ldr x25, [x16, #0x58]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ldr x19, [x15, #0x78]\n"
- "ld1w { z14.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ld1w { z14.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x22, [x15, #0x60]\n"
- "ld1w { z15.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
"movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
"movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1w { z12.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ldr x28, [x16, #0x80]\n"
+ "ld1w { z12.s }, p2/Z, [x28, x9, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z13.s\n"
"fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x19, x28, LSL #2]\n"
- "ldr x21, [x15, #0x68]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x22, [x16, #0x68]\n"
"fmla z30.s, p3/M, z3.s, z14.s\n"
"fmla z31.s, p3/M, z4.s, z13.s\n"
- "ldr x26, [x15, #0x88]\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x27, [x16, #0x88]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
"fmla z30.s, p3/M, z0.s, z15.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x26, x28, LSL #2]\n"
- "ldr x20, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
+ "ld1w { z14.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x98]\n"
"fmla z30.s, p3/M, z4.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z14.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ldr x25, [x15, #0x90]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ldr x26, [x16, #0x90]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x21, [x15, #0xa8]\n"
+ "ldr x22, [x16, #0xa8]\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x22, [x15, #0xa0]\n"
+ "ld1w { z15.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x23, [x16, #0xa0]\n"
"fmla z30.s, p3/M, z6.s, z15.s\n"
"fmla z31.s, p3/M, z3.s, z16.s\n"
- "ldr x20, [x15, #0xb0]\n"
- "ld1w { z13.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
"fmla z30.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x19, [x15, #0xb8]\n"
+ "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla z31.s, p3/M, z7.s, z14.s\n"
"fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x19, x28, LSL #2]\n"
- "ldr x27, [x15, #0xc0]\n"
+ "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x28, [x16, #0xc0]\n"
"fmla z31.s, p3/M, z6.s, z15.s\n"
"fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldp x27, x26, [x15, #0x0]\n"
+ "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ldp x28, x27, [x16, #0x0]\n"
"fmla z30.s, p3/M, z8.s, z15.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ld1w { z19.s }, p3/Z, [x14]\n"
- "addvl x14, x14, #1\n"
- "incw x28\n"
- "ldp x22, x21, [x15, #0x20]\n"
- "ld1w { z9.s }, p1/Z, [x27, x11, LSL #2]\n"
- "incw x23\n"
+ "ldp x26, x25, [x16, #0x10]\n"
+ "ld1w { z19.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "incw x9\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z9.s }, p1/Z, [x28, x12, LSL #2]\n"
+ "incw x24\n"
"mov p0.b, p2.b\n"
- "ldp x20, x19, [x15, #0x30]\n"
- "ld1w { z10.s }, p1/Z, [x26, x11, LSL #2]\n"
- "whilelt p2.s, x28, %x[n_channels]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z10.s }, p1/Z, [x27, x12, LSL #2]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "ld1w { z11.s }, p1/Z, [x25, x11, LSL #2]\n"
- "st1w { z28.s }, p0, [x13, x23, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x24, x11, LSL #2]\n"
- "st1w { z29.s }, p0, [x12, x23, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x22, x11, LSL #2]\n"
- "st1w { z30.s }, p0, [x10, x23, LSL #2]\n"
- "ld1w { z14.s }, p1/Z, [x21, x11, LSL #2]\n"
- "st1w { z31.s }, p0, [x9, x23, LSL #2]\n"
- "ld1w { z15.s }, p1/Z, [x20, x11, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x19, x11, LSL #2]\n"
- "incw x11\n"
- "cmp x11, %x[n_channels]\n"
- ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
- ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
- "ld1w { z8.s }, p3/Z, [x14]\n"
- "addvl x14, x14, #1\n"
+ "ld1w { z11.s }, p1/Z, [x26, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x24, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x25, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x24, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x24, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x24, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "cmp x12, %x[n_channels]\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
"blt 1b\n"
"2:" // Channel tail
"movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
"movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x27, [x15, #0x40]\n"
- "incw x23\n"
+ "ldr x28, [x16, #0x40]\n"
+ "incw x24\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x26, [x15, #0x48]\n"
- "ld1w { z12.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "ldr x27, [x16, #0x48]\n"
+ "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
- "ldr x25, [x15, #0x50]\n"
+ "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ldr x26, [x16, #0x50]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ldr x24, [x15, #0x58]\n"
+ "ld1w { z13.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ldr x25, [x16, #0x58]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ldr x19, [x15, #0x78]\n"
- "ld1w { z14.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ld1w { z14.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
"fmla z29.s, p3/M, z5.s, z12.s\n"
- "ldr x22, [x15, #0x60]\n"
- "ld1w { z15.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
"movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
"movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x27, [x15, #0x80]\n"
- "ld1w { z12.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ldr x28, [x16, #0x80]\n"
+ "ld1w { z12.s }, p2/Z, [x28, x9, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z13.s\n"
"fmla z29.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x19, x28, LSL #2]\n"
- "ldr x21, [x15, #0x68]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x22, [x16, #0x68]\n"
"fmla z30.s, p3/M, z3.s, z14.s\n"
"fmla z31.s, p3/M, z4.s, z13.s\n"
- "ldr x26, [x15, #0x88]\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x27, [x16, #0x88]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
"fmla z30.s, p3/M, z0.s, z15.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x26, x28, LSL #2]\n"
- "ldr x20, [x15, #0x70]\n"
- "ldr x24, [x15, #0x98]\n"
+ "ld1w { z14.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x98]\n"
"fmla z30.s, p3/M, z4.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z14.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ldr x25, [x15, #0x90]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ldr x26, [x16, #0x90]\n"
"fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x21, [x15, #0xa8]\n"
+ "ldr x22, [x16, #0xa8]\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x25, x28, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x22, [x15, #0xa0]\n"
+ "ld1w { z15.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x23, [x16, #0xa0]\n"
"fmla z30.s, p3/M, z6.s, z15.s\n"
"fmla z31.s, p3/M, z3.s, z16.s\n"
- "ldr x20, [x15, #0xb0]\n"
- "ld1w { z13.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
"fmla z30.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z7.s, z12.s\n"
- "ld1w { z14.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x19, [x15, #0xb8]\n"
+ "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla z31.s, p3/M, z7.s, z14.s\n"
"fmla z30.s, p3/M, z5.s, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x19, x28, LSL #2]\n"
- "ldr x27, [x15, #0xc0]\n"
+ "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x28, [x16, #0xc0]\n"
"fmla z31.s, p3/M, z6.s, z15.s\n"
"fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x28, x9, LSL #2]\n"
"fmla z30.s, p3/M, z8.s, z15.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
"mov p0.b, p2.b\n"
".inst 0xc1b1ca5c // fclamp { z28.s-z31.s }, z18.s, z17.s\n"
- "st1w { z28.s }, p0, [x13, x23, LSL #2]\n"
- "st1w { z29.s }, p0, [x12, x23, LSL #2]\n"
- "st1w { z30.s }, p0, [x10, x23, LSL #2]\n"
- "st1w { z31.s }, p0, [x9, x23, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x24, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x24, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
index 493166cb19..4d02d29e4e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,316 +69,316 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x6\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x7\n"
- "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ld1rw { z5.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x16\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z11.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x17\n"
+ "whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z16.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z16.s }, p1/Z, [x19, x15, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z16.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x14, #0x1\n"
- "orr x23, x19, %x[ld_in_col], LSL #18\n"
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
"mov z17.d, z16.d\n"
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa1404ac0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x22]\n"
- "orr x23, x16, x23, LSL #20\n"
- "mov x21, #0x6\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z3.s }, p2/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "add x20, x17, x7\n"
- ".inst 0xa0404ac6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x22]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x17, x24, LSL #20\n"
+ "mov x22, #0x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z3.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x7, x6\n"
+ ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
"mov z18.d, z16.d\n"
"mov z19.d, z16.d\n"
- "ld1w { z9.s }, p2/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
+ "ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
"mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ac2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x22]\n"
- "lsl x23, x23, #0x2\n"
- "sub x21, x21, x20\n"
- "ld1w { z1.s }, p2/Z, [x22, #2, MUL VL]\n"
- "madd x19, x19, x17, x13\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x7, x14\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b74a7c // rprfm pldstrm, x23, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x19, x13\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x7, x20, x14\n"
".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
- "mov x9, #0x2\n"
- "ldp x28, x27, [x10], #0x10\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x11], #0x10\n"
".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x24, x23, [x10], #0x10\n"
- "ldp x22, x21, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x9\n"
- "csel x19, x20, x9, LT\n"
- "sub x20, x20, x19\n"
- "sub x9, x9, x19\n"
- "cbz x20, 5f\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x25, x24, [x11], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- "sub x11, x11, x20\n"
+ "sub x13, x13, x21\n"
".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z12.s }, p1, [x28]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z13.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
- "st1w { z14.s }, p1, [x24]\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "st1w { z15.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x17, x7\n"
+ "adds XZR, x7, x6\n"
"bne 10f\n"
- "cbz x9, 8f\n"
- "cmp x9, #0x1\n"
- "sub x14, x14, x9\n"
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
- "ld1w { z28.s }, p1/Z, [x19]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
"7:" // Unpadded: 1 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- "ld1w { z28.s }, p1/Z, [x19]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
"8:" // Unpadded: 0 priming loads
- "cbz x14, 16f\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x13]\n"
- "sub x14, x14, #0x1\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, #0x1\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "cmp x14, x11\n"
- "ld1w { z26.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "csel x20, x14, x11, LT\n"
- "ld1w { z27.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x19]\n"
- "sub x11, x11, x20\n"
- "cbz x20, 15f\n"
+ "cbz x15, 16f\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x15, x13\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x15, x13, LT\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "sub x13, x13, x21\n"
+ "cbz x21, 15f\n"
"9:" // Unpadded: Main loop
".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "subs x20, x20, #0x1\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- "ld1w { z23.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z14.s }, p1, [x24]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z15.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ld1w { z28.s }, p1/Z, [x19]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
- "cbz x9, 13f\n"
- "cmp x9, #0x1\n"
- "sub x14, x14, x9\n"
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x19]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x19]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
- "ld1w { z27.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- "ld1w { z28.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
"13:" // Padded: 0 priming loads
- "cbz x14, 16f\n"
+ "cbz x15, 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z26.s }, p0/Z, [x19]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "sub x14, x14, #0x1\n"
- "sub x11, x11, #0x1\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "cmp x14, x11\n"
- "ld1w { z27.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "cmp x15, x13\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x19]\n"
- "csel x20, x14, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "sub x11, x11, x20\n"
- "cbz x20, 15f\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "csel x21, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "sub x13, x13, x21\n"
+ "cbz x21, 15f\n"
"14:" // Padded: Main loop
".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "subs x20, x20, #0x1\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
".inst 0xc1381ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z8.s\n"
".inst 0xc1301ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z0.s\n"
- "ld1w { z23.s }, p0/Z, [x13]\n"
+ "ld1w { z23.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
".inst 0xc1371b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z7.s\n"
".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z26.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z14.s }, p1, [x24]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x8, x8, #0x1\n"
- "ld1w { z27.s }, p0/Z, [x19]\n"
- "st1w { z15.s }, p1, [x23]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "ld1w { z28.s }, p0/Z, [x19]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x25, x25, x23, LSL #2\n"
"add x24, x24, x22, LSL #2\n"
- "add x23, x23, x21, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
@@ -390,62 +390,62 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
".inst 0xc1361b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z6.s\n"
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
- "st1w { z13.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc1321b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z2.s\n"
"add x8, x8, #0x1\n"
- "st1w { z14.s }, p1, [x24]\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "st1w { z15.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"16:" // Main loop skip tail
- "cbz x11, 18f\n"
+ "cbz x13, 18f\n"
"17:" // Right padding loop
".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x11, x11, #0x1\n"
+ "subs x13, x13, #0x1\n"
".inst 0xc1abc8ac // fclamp { z12.s-z15.s }, z5.s, z11.s\n"
- "st1w { z12.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
- "st1w { z13.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
- "st1w { z14.s }, p1, [x24]\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "st1w { z15.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
"bgt 17b\n"
"18:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x13, x13, x19, LSL #2\n"
- "str x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x10, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x23, ALL, MUL #9\n"
+ "str x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x14, x14, x20, LSL #2\n"
+ "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x0]\n"
- "ldp x22, x21, [x10, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x11, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x10]\n"
+ "stp x23, x22, [x11, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
index 289803ce8c..9f6b09ef88 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,578 +69,578 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x9\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x7\n"
- "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x16\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z19.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x17\n"
+ "whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z24.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z24.s }, p1/Z, [x19, x15, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z24.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x14, #0x1\n"
- "orr x23, x19, %x[ld_in_col], LSL #18\n"
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
"mov z25.d, z24.d\n"
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa0404ac2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x22]\n"
- "orr x23, x16, x23, LSL #20\n"
- "mov x21, #0x9\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z7.s }, p2/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "add x20, x17, x7\n"
- ".inst 0xa0404ac4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x22]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa0404ae2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x17, x24, LSL #20\n"
+ "mov x22, #0x9\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z7.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x7, x6\n"
+ ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
"mov z26.d, z24.d\n"
"mov z27.d, z24.d\n"
- "ld1w { z6.s }, p2/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
+ "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
"mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ac1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x22]\n"
- "lsl x23, x23, #0x2\n"
- "sub x21, x21, x20\n"
- "ld1w { z8.s }, p2/Z, [x22, #2, MUL VL]\n"
- "madd x19, x19, x17, x13\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x7, x14\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b74a7c // rprfm pldstrm, x23, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x19, x13\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x7, x20, x14\n"
".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
- "mov x21, #0x2\n"
- "ldp x9, x28, [x10], #0x10\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x11], #0x10\n"
".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "ldp x27, x26, [x19], #0x10\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 5f\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x11], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
- "and x21, x20, #0x1\n"
- "add x20, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "lsr x20, x20, #0x1\n"
- "sub x11, x11, x20\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x13, x13, x21\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z20.s }, p1, [x9]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z21.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z22.s }, p1, [x25]\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z23.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x17, x7\n"
+ "adds XZR, x7, x6\n"
"bne 10f\n"
- "cbz x21, 8f\n"
- "cmp x21, #0x1\n"
- "sub x14, x14, x21\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1321980 // fmla za.s[x8, 0], { z12.s-z15.s }, z2.s\n"
- "ld1w { z0.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1341ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z4.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0xc13119a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z1.s\n"
"7:" // Unpadded: 1 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "ld1w { z0.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
- "8:" // Unpadded: 0 priming loads
- "cmp x14, #0x2\n"
- "blt 16f\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x13]\n"
- "sub x14, x14, #0x2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
"ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, #0x1\n"
"ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "lsr x19, x14, #0x1\n"
"ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x19, x11\n"
"ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x19, x11, LT\n"
"ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "and x14, x14, #0x1\n"
+ ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
"ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, x21\n"
+ ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- "cbz x21, 15f\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ "blt 16f\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x15, #0x1\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x13, LT\n"
+ "ld1w { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x15, x15, #0x1\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, x22\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "cbz x22, 15f\n"
"9:" // Unpadded: Main loop
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "subs x21, x21, #0x1\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "ld1w { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
"ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x9]\n"
"ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z21.s }, p1, [x28]\n"
"ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "add x28, x28, x26, LSL #2\n"
"ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "st1w { z22.s }, p1, [x25]\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
- "add x25, x25, x23, LSL #2\n"
- "st1w { z23.s }, p1, [x24]\n"
- "ld1w { z12.s }, p1/Z, [x13]\n"
- "add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
- "cbz x21, 13f\n"
- "cmp x21, #0x1\n"
- "sub x14, x14, x21\n"
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x19]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1321980 // fmla za.s[x8, 0], { z12.s-z15.s }, z2.s\n"
- "ld1w { z0.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1341ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z4.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc13119a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z1.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x19]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "ld1w { z0.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
"13:" // Padded: 0 priming loads
- "cmp x14, #0x2\n"
+ "cmp x15, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x14, x14, #0x2\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, #0x2\n"
+ "ld1w { z31.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "sub x11, x11, #0x1\n"
- "lsr x19, x14, #0x1\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ "sub x13, x13, #0x1\n"
+ "lsr x20, x15, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "cmp x19, x11\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "csel x21, x19, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "and x14, x14, #0x1\n"
- "sub x11, x11, x21\n"
- "cbz x21, 15f\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 15f\n"
"14:" // Padded: Main loop
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- "subs x21, x21, #0x1\n"
+ "subs x22, x22, #0x1\n"
".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z20.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z20.s }, p1, [x10]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x28]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x20]\n"
- "st1w { z22.s }, p1, [x25]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ "st1w { z22.s }, p1, [x26]\n"
"mov x12, #0x8\n"
".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z23.s }, p1, [x24]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
"mov x12, #0x0\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
- "ld1w { z29.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z0.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
"add x9, x9, x27, LSL #2\n"
- "add x28, x28, x26, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
"add x25, x25, x23, LSL #2\n"
- "add x24, x24, x22, LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
- "ld1w { z29.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z30.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z20.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z20.s }, p1, [x10]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x28]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z0.s }, p0/Z, [x19]\n"
- "st1w { z22.s }, p1, [x25]\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "st1w { z22.s }, p1, [x26]\n"
"mov x12, #0x8\n"
".inst 0xc1331980 // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z23.s }, p1, [x24]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1351ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z5.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
"add x9, x9, x27, LSL #2\n"
- "add x28, x28, x26, LSL #2\n"
- "add x25, x25, x23, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "add x24, x24, x22, LSL #2\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
"16:" // Main loop skip tail
- "cbz x14, 17f\n" // Skip remainder inputs
+ "cbz x15, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z12.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z30.s }, p0/Z, [x19]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z31.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z0.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1361ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z6.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc1321981 // fmla za.s[x8, 1], { z12.s-z15.s }, z2.s\n"
- "sub x11, x11, #0x1\n"
+ "sub x13, x13, #0x1\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
".inst 0xc1341ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z4.s\n"
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc13119a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z1.s\n"
"add x8, x8, #0x1\n"
- "st1w { z21.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z22.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "st1w { z23.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
- "cbz x11, 19f\n"
+ "cbz x13, 19f\n"
"18:" // Right padding loop
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x11, x11, #0x1\n"
+ "subs x13, x13, #0x1\n"
".inst 0xc1b3cb94 // fclamp { z20.s-z23.s }, z28.s, z19.s\n"
- "st1w { z20.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "st1w { z21.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z22.s }, p1, [x25]\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z23.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
"bgt 18b\n"
"19:" // End
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x22, ALL, MUL #9\n"
- "str x22, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x13, x13, x19, LSL #2\n"
- "str x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x10, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x23, ALL, MUL #9\n"
+ "str x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x14, x14, x20, LSL #2\n"
+ "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x0]\n"
- "ldp x22, x21, [x10, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x11, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x10]\n"
+ "stp x23, x22, [x11, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
index 0753e2db88..bf12b42ddc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,745 +69,745 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x8\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x5\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ld1rw { z22.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z11.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "whilelt p8.s, XZR, x6\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z28.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z28.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #18\n"
+ "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x16, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
"mov z29.d, z28.d\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "orr x22, x17, x22, LSL #20\n"
- "mov x21, #0x8\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "add x20, x7, x6\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"mov z30.d, z28.d\n"
"mov z31.d, z28.d\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x22, x22, #0x2\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x7, x13\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x14\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x13, x7, x19, x13\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x6, x20, x14\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
- "mov x9, #0x4\n"
- "ldp x28, x27, [x10], #0x10\n"
+ "mov x10, #0x4\n"
+ "ldp x9, x28, [x11], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x24, x23, [x10], #0x10\n"
- "ldp x22, x21, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x9\n"
- "csel x19, x20, x9, LT\n"
- "sub x20, x20, x19\n"
- "sub x9, x9, x19\n"
- "cbz x20, 5f\n"
+ "ldp x25, x24, [x11], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "sub x11, x11, x20\n"
+ "sub x13, x13, x21\n"
".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z24.s }, p1, [x28]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z25.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z25.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
- "st1w { z26.s }, p1, [x24]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x6, x5\n"
"bne 12f\n"
- "cbz x9, 10f\n"
- "cmp x9, #0x1\n"
- "sub x15, x15, x9\n"
+ "cbz x10, 10f\n"
+ "cmp x10, #0x1\n"
+ "sub x16, x16, x10\n"
"beq 9f\n"
- "cmp x9, #0x2\n"
+ "cmp x10, #0x2\n"
"beq 8f\n"
- "cmp x9, #0x3\n"
+ "cmp x10, #0x3\n"
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1341a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
"7:" // Unpadded: 3 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13019e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1351a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z5.s\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1351a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"8:" // Unpadded: 2 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13219c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z2.s\n"
".inst 0xc13519c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z5.s\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13019e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z0.s\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1381a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z8.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1301a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z0.s\n"
".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"9:" // Unpadded: 1 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13319c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z3.s\n"
".inst 0xc13219c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z2.s\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13419c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13019e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z5.s\n"
".inst 0xc1341a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
".inst 0xc1301a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z0.s\n"
".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z5.s\n"
".inst 0xc1341a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"10:" // Unpadded: 0 priming loads
- "cbz x15, 20f\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
- "sub x15, x15, #0x1\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "cmp x15, x11\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "csel x20, x15, x11, LT\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, x20\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "cbz x20, 19f\n"
+ "cbz x16, 20f\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x14]\n"
+ "sub x16, x16, #0x1\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x16, x13\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x16, x13, LT\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, x21\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "cbz x21, 19f\n"
"11:" // Unpadded: Main loop
".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "subs x20, x20, #0x1\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z25.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "st1w { z26.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
- "cbz x9, 17f\n"
- "cmp x9, #0x1\n"
- "sub x15, x15, x9\n"
+ "cbz x10, 17f\n"
+ "cmp x10, #0x1\n"
+ "sub x16, x16, x10\n"
"beq 16f\n"
- "cmp x9, #0x2\n"
+ "cmp x10, #0x2\n"
"beq 15f\n"
- "cmp x9, #0x3\n"
+ "cmp x10, #0x3\n"
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
- "addvl x14, x14, #5\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1341a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13019e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x14, x14, #5\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1351a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z5.s\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1351a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13219c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13519c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z5.s\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x14, x14, #5\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc13019e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1381a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z8.s\n"
".inst 0xc1301a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z0.s\n"
".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13319c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z3.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13219c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z2.s\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13419c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13019e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z5.s\n"
".inst 0xc1341a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
".inst 0xc1301a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z0.s\n"
".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z5.s\n"
".inst 0xc1341a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"17:" // Padded: 0 priming loads
- "cbz x15, 20f\n"
+ "cbz x16, 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z14.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x15, x15, #0x1\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "sub x11, x11, #0x1\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "sub x16, x16, #0x1\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "sub x13, x13, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "cmp x15, x11\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "cmp x16, x13\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "csel x20, x15, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "sub x11, x11, x20\n"
- "cbz x20, 19f\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "csel x21, x16, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "sub x13, x13, x21\n"
+ "cbz x21, 19f\n"
"18:" // Padded: Main loop
".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "subs x20, x20, #0x1\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
- "ld1w { z14.s }, p0/Z, [x13]\n"
+ "ld1w { z14.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
".inst 0xc1381a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z8.s\n"
".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z25.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "st1w { z26.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
"add x8, x8, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"bgt 18b\n"
"19:" // Main loop tail
".inst 0xc13a19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc13119e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z1.s\n"
".inst 0xc13319c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z3.s\n"
".inst 0xc13219c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
".inst 0xc13419c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc13719e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z7.s\n"
".inst 0xc13619e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13819e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z8.s\n"
".inst 0xc13019e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z0.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1351a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z5.s\n"
".inst 0xc1341a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
@@ -815,65 +815,65 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc1331a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z3.s\n"
- "st1w { z25.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z25.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
- "st1w { z26.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc1351a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z5.s\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
".inst 0xc1341a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z4.s\n"
"add x8, x8, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"20:" // Main loop skip tail
- "cbz x11, 22f\n"
+ "cbz x13, 22f\n"
"21:" // Right padding loop
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x11, x11, #0x1\n"
+ "subs x13, x13, #0x1\n"
".inst 0xc1abcad8 // fclamp { z24.s-z27.s }, z22.s, z11.s\n"
- "st1w { z24.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "st1w { z25.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
- "st1w { z26.s }, p1, [x24]\n"
+ "st1w { z25.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
"bgt 21b\n"
"22:" // End
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x14, ALL, MUL #16\n"
- "incb x14, ALL, MUL #9\n"
- "str x14, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x16\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x13, x13, x19, LSL #2\n"
- "str x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x10, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x15, ALL, MUL #16\n"
+ "incb x15, ALL, MUL #9\n"
+ "str x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x14, x14, x20, LSL #2\n"
+ "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x0]\n"
- "ldp x22, x21, [x10, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x11, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x10]\n"
+ "stp x23, x22, [x11, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
index 8920b3b749..755265835d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,1100 +69,1100 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0xb\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x6\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x5\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ld1rw { z0.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x7\n"
+ "whilelt p8.s, XZR, x6\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z28.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z28.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #18\n"
+ "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x16, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
"mov z29.d, z28.d\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "orr x22, x17, x22, LSL #20\n"
- "mov x21, #0xb\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "add x20, x7, x6\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"mov z30.d, z28.d\n"
"mov z31.d, z28.d\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
"mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x22, x22, #0x2\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x7, x13\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x14\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x13, x7, x19, x13\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x6, x20, x14\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
- "mov x21, #0x4\n"
- "ldp x9, x28, [x10], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x10, x9, [x11], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x27, x26, [x19], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 5f\n"
+ "ldp x26, x25, [x11], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "and x21, x20, #0x1\n"
- "add x20, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "lsr x20, x20, #0x1\n"
- "sub x11, x11, x20\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x13, x13, x21\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z24.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z25.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z25.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z27.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x7, x6\n"
+ "adds XZR, x6, x5\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
"beq 9f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 8f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1341980 // fmla za.s[x8, 0], { z12.s-z15.s }, z4.s\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
"7:" // Unpadded: 3 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1351980 // fmla za.s[x8, 0], { z12.s-z15.s }, z5.s\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"8:" // Unpadded: 2 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1321aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z2.s\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1311aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1361980 // fmla za.s[x8, 0], { z12.s-z15.s }, z6.s\n"
".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13619a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"9:" // Unpadded: 1 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "10:" // Unpadded: 0 priming loads
- "cmp x15, #0x2\n"
- "blt 20f\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "sub x15, x15, #0x2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, #0x1\n"
"ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "lsr x19, x15, #0x1\n"
"ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x19, x11\n"
"ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "csel x21, x19, x11, LT\n"
"ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "and x15, x15, #0x1\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
"ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, x21\n"
+ ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
"ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
"ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- "cbz x21, 19f\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "sub x16, x16, #0x2\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x16, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x13, LT\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, x22\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "cbz x22, 19f\n"
"11:" // Unpadded: Main loop
".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "subs x21, x21, #0x1\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
+ ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
+ "st1w { z24.s }, p1, [x10]\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "st1w { z25.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z11.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
"ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
"ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x9]\n"
"ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "st1w { z25.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "st1w { z26.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
"ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- "st1w { z27.s }, p1, [x24]\n"
- ".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
"ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "add x24, x24, x22, LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z11.s }, p1/Z, [x13]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z25.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
- "cbz x21, 17f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
"beq 16f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 15f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1341980 // fmla za.s[x8, 0], { z12.s-z15.s }, z4.s\n"
- "addvl x14, x14, #5\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x14, x14, #5\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1351980 // fmla za.s[x8, 0], { z12.s-z15.s }, z5.s\n"
- "addvl x14, x14, #5\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x14, x14, #5\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0xc1321aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1311aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1361980 // fmla za.s[x8, 0], { z12.s-z15.s }, z6.s\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13619a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"17:" // Padded: 0 priming loads
- "cmp x15, #0x2\n"
+ "cmp x16, #0x2\n"
"blt 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "sub x15, x15, #0x2\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "sub x16, x16, #0x2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "sub x11, x11, #0x1\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "lsr x19, x15, #0x1\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ "lsr x20, x16, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "cmp x19, x11\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "csel x21, x19, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "and x15, x15, #0x1\n"
- "sub x11, x11, x21\n"
- "cbz x21, 19f\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 19f\n"
"18:" // Padded: Main loop
".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "subs x21, x21, #0x1\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"add x8, x8, #0x1\n"
- "addvl x14, x14, #5\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "st1w { z24.s }, p1, [x10]\n"
"mov x12, #0x8\n"
".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "add x9, x9, x27, LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "st1w { z25.s }, p1, [x28]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "st1w { z25.s }, p1, [x9]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x25, x25, x23, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z27.s }, p1, [x24]\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
"mov x12, #0x0\n"
".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "add x24, x24, x22, LSL #2\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"bgt 18b\n"
"19:" // Main loop tail
".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"add x8, x8, #0x1\n"
- "addvl x14, x14, #5\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x9]\n"
+ "st1w { z24.s }, p1, [x10]\n"
"mov x12, #0x8\n"
".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- "add x9, x9, x27, LSL #2\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc1351961 // fmla za.s[x8, 1], { z11.s-z14.s }, z5.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "st1w { z25.s }, p1, [x28]\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "st1w { z25.s }, p1, [x9]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc13a1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z10.s\n"
".inst 0xc1391aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1351981 // fmla za.s[x8, 1], { z12.s-z15.s }, z5.s\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1371980 // fmla za.s[x8, 0], { z12.s-z15.s }, z7.s\n"
- "add x25, x25, x23, LSL #2\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z27.s }, p1, [x24]\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
".inst 0xc1391ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z9.s\n"
- "add x24, x24, x22, LSL #2\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc13519a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z5.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
"20:" // Main loop skip tail
- "cbz x15, 21f\n" // Skip remainder inputs
+ "cbz x16, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x19]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z14.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
- "ld1w { z24.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1331aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z3.s\n"
- "ld1w { z15.s }, p0/Z, [x19]\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "sub x11, x11, #0x1\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z25.s }, p0/Z, [x19]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc1311aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z1.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
- ".inst 0xa14149c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
".inst 0xc1361981 // fmla za.s[x8, 1], { z12.s-z15.s }, z6.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc1341982 // fmla za.s[x8, 2], { z12.s-z15.s }, z4.s\n"
- "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
".inst 0xc13819a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z8.s\n"
".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xc1311ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z24.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- "st1w { z25.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z25.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc13419a2 // fmla za.s[x8, 2], { z13.s-z16.s }, z4.s\n"
"add x8, x8, #0x1\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z27.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"21:" // Tail input: End
- "cbz x11, 23f\n"
+ "cbz x13, 23f\n"
"22:" // Right padding loop
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x11, x11, #0x1\n"
+ "subs x13, x13, #0x1\n"
".inst 0xc1b1c818 // fclamp { z24.s-z27.s }, z0.s, z17.s\n"
- "st1w { z24.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z24.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "st1w { z25.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z25.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z27.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z27.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
"bgt 22b\n"
"23:" // End
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x14, ALL, MUL #16\n"
- "incb x14, ALL, MUL #9\n"
- "str x14, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x16\n"
- "whilelt p1.s, x16, x17\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x13, x13, x19, LSL #2\n"
- "str x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x10, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x15, ALL, MUL #16\n"
+ "incb x15, ALL, MUL #9\n"
+ "str x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x14, x14, x20, LSL #2\n"
+ "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x0]\n"
- "ldp x22, x21, [x10, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x11, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x10]\n"
+ "stp x23, x22, [x11, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
index e6c0cb7910..5570b27644 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,174 +69,174 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x6\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x7\n"
- "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
"ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x16\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x17\n"
+ "whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z22.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z22.s }, p1/Z, [x19, x15, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z22.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x19\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x21, x20\n"
"fmov z9.s, #0x0\n"
- "ld1w { z25.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- "incb x19\n"
- "ld1w { z27.s }, p2/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "incb x20\n"
+ "ld1w { z27.s }, p2/Z, [x21]\n"
".inst 0x648aab29 // bfcvtnt z9.h, p2/M, z25.s\n"
- "incb x20, ALL, MUL #3\n"
- "ld1w { z21.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "incb x21, ALL, MUL #3\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x658aab28 // bfcvt z8.h, p2/M, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
"fmov z2.s, #0x0\n"
- "incb x20, ALL, MUL #3\n"
+ "incb x21, ALL, MUL #3\n"
".inst 0x658aab21 // bfcvt z1.h, p2/M, z25.s\n"
".inst 0x648aab68 // bfcvtnt z8.h, p2/M, z27.s\n"
- "incb x19\n"
- "ld1w { z27.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
+ "incb x20\n"
+ "ld1w { z27.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
".inst 0x648aaaa6 // bfcvtnt z6.h, p2/M, z21.s\n"
".inst 0x658aaaa5 // bfcvt z5.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x648aab22 // bfcvtnt z2.h, p2/M, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
".inst 0x648aab61 // bfcvtnt z1.h, p2/M, z27.s\n"
".inst 0x658aab6c // bfcvt z12.h, p2/M, z27.s\n"
- "ld1w { z27.s }, p2/Z, [x20]\n"
- "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- "incb x20, ALL, MUL #3\n"
+ "ld1w { z27.s }, p2/Z, [x21]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "incb x21, ALL, MUL #3\n"
"fmov z7.s, #0x0\n"
".inst 0x658aab24 // bfcvt z4.h, p2/M, z25.s\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
".inst 0x648aaaac // bfcvtnt z12.h, p2/M, z21.s\n"
- "sub x19, x14, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #18\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
".inst 0x658aaaaa // bfcvt z10.h, p2/M, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x20]\n"
- "orr x22, x16, x22, LSL #20\n"
- "mov x21, #0x6\n"
- "add x20, x17, x7\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
+ "orr x23, x17, x23, LSL #20\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
"mov z23.d, z22.d\n"
".inst 0x648aab27 // bfcvtnt z7.h, p2/M, z25.s\n"
".inst 0x648aab64 // bfcvtnt z4.h, p2/M, z27.s\n"
".inst 0x648aaaa0 // bfcvtnt z0.h, p2/M, z21.s\n"
"mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
".inst 0x658aaaa3 // bfcvt z3.h, p2/M, z21.s\n"
- "lsl x22, x22, #0x2\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x17, x13\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x19, x13\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x7, x20, x14\n"
".inst 0xc0040ac0 // mova za.d[x8, #0], { z22.d-z23.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040ac1 // mova za.d[x8, #1], { z22.d-z23.d }\n"
- "mov x9, #0x2\n"
- "ldp x28, x27, [x10], #0x10\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x11], #0x10\n"
".inst 0xc0040ac2 // mova za.d[x8, #2], { z22.d-z23.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
".inst 0xc0040ac3 // mova za.d[x8, #3], { z22.d-z23.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "ldp x24, x23, [x10], #0x10\n"
+ "ldp x25, x24, [x11], #0x10\n"
".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "ldp x22, x21, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x9\n"
- "csel x19, x20, x9, LT\n"
- "sub x20, x20, x19\n"
- "sub x9, x9, x19\n"
- "cbz x20, 5f\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "sub x11, x11, x20\n"
+ "sub x13, x13, x21\n"
".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z24.s }, p1, [x28]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
- "st1w { z25.s }, p1, [x24]\n"
+ "st1w { z25.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x17, x7\n"
+ "adds XZR, x7, x6\n"
"bne 10f\n"
- "cbz x9, 8f\n"
- "cmp x9, #0x1\n"
- "sub x14, x14, x9\n"
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc12811b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z8.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc12911b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
".inst 0xc12511d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z5.h\n"
".inst 0xc12611d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z6.h\n"
"7:" // Unpadded: 1 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc12111b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc12211b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
".inst 0xc12811b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z8.h\n"
".inst 0xc12911b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z9.h\n"
@@ -245,51 +245,51 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xc12511d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z5.h\n"
".inst 0xc12611d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z6.h\n"
"8:" // Unpadded: 0 priming loads
- "cbz x14, 16f\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "cbz x15, 16f\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "sub x14, x14, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "cmp x14, x11\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "csel x20, x14, x11, LT\n"
+ "cmp x15, x13\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x15, x13, LT\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x11, x11, x20\n"
- "cbz x20, 15f\n"
+ "sub x13, x13, x21\n"
+ "cbz x21, 15f\n"
"9:" // Unpadded: Main loop
".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x13]\n"
- "subs x20, x20, #0x1\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x14]\n"
+ "subs x21, x21, #0x1\n"
".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
@@ -306,155 +306,155 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x28]\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z25.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
- "cbz x9, 13f\n"
- "cmp x9, #0x1\n"
- "sub x14, x14, x9\n"
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12811b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z8.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
".inst 0xc12911b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z9.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc12511d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z5.h\n"
".inst 0xc12611d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z6.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12111b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
".inst 0xc12211b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z2.h\n"
".inst 0xc12811b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z8.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc12911b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z9.h\n"
".inst 0xc12a11d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z10.h\n"
".inst 0xc12c11d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z12.h\n"
".inst 0xc12511d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z5.h\n"
".inst 0xc12611d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z6.h\n"
"13:" // Padded: 0 priming loads
- "cbz x14, 16f\n"
+ "cbz x15, 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "sub x14, x14, #0x1\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x15, x15, #0x1\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x11, x11, #0x1\n"
- "cmp x14, x11\n"
- "csel x20, x14, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "sub x11, x11, x20\n"
- "cbz x20, 15f\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "csel x21, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "sub x13, x13, x21\n"
+ "cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x13]\n"
+ "ld1w { z21.s }, p0/Z, [x14]\n"
".inst 0xc12711b1 // bfdot za.s[x8, 1], { z13.h-z14.h }, z7.h\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0xc12311d0 // bfdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc12011d1 // bfdot za.s[x8, 1], { z14.h-z15.h }, z0.h\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
".inst 0xc12111b2 // bfdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12211b3 // bfdot za.s[x8, 3], { z13.h-z14.h }, z2.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12811b4 // bfdot za.s[x8, 4], { z13.h-z14.h }, z8.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "subs x20, x20, #0x1\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
".inst 0xc12911b5 // bfdot za.s[x8, 5], { z13.h-z14.h }, z9.h\n"
".inst 0x658aaaad // bfcvt z13.h, p2/M, z21.s\n"
".inst 0x648aaa8d // bfcvtnt z13.h, p2/M, z20.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
@@ -467,16 +467,16 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x28]\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z25.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
".inst 0xc12411b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z4.h\n"
@@ -490,67 +490,67 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc12a11d2 // bfdot za.s[x8, 2], { z14.h-z15.h }, z10.h\n"
- "st1w { z26.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z26.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
".inst 0xc12c11d3 // bfdot za.s[x8, 3], { z14.h-z15.h }, z12.h\n"
- "st1w { z25.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z25.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc12511d4 // bfdot za.s[x8, 4], { z14.h-z15.h }, z5.h\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
".inst 0xc12611d5 // bfdot za.s[x8, 5], { z14.h-z15.h }, z6.h\n"
"add x8, x8, #0x2\n"
".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
"16:" // Main loop skip tail
- "cbz x11, 18f\n"
+ "cbz x13, 18f\n"
"17:" // Right padding loop
".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "subs x11, x11, #0x1\n"
+ "subs x13, x13, #0x1\n"
".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
".inst 0xc1bdcb98 // fclamp { z24.s-z27.s }, z28.s, z29.s\n"
- "st1w { z24.s }, p1, [x28]\n"
+ "st1w { z24.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z26.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z26.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
".inst 0xc0040ac4 // mova za.d[x8, #4], { z22.d-z23.d }\n"
- "st1w { z25.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z25.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0xc0040ac5 // mova za.d[x8, #5], { z22.d-z23.d }\n"
- "st1w { z27.s }, p1, [x23]\n"
- "add x23, x23, x21, LSL #2\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x19, ALL, MUL #9\n"
- "str x19, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x13, x13, x19, LSL #2\n"
- "str x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x10, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x14, x14, x20, LSL #2\n"
+ "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x0]\n"
- "ldp x22, x21, [x10, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x11, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x10]\n"
+ "stp x23, x22, [x11, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
index 253f0dae0c..e8c9bfeb29 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,691 +69,691 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x9\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x7\n"
- "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
"ld1rw { z27.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x16\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z23.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x17\n"
+ "whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z4.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z4.s }, p1/Z, [x19, x15, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x19\n"
- "ld1w { z19.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- "incb x19\n"
- "ld1w { z24.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x21, x20\n"
+ "ld1w { z19.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "incb x20\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
".inst 0x658aaa69 // bfcvt z9.h, p2/M, z19.s\n"
- "ld1w { z12.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "ld1w { z12.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x648aab09 // bfcvtnt z9.h, p2/M, z24.s\n"
- "incb x19\n"
- "ld1w { z19.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
+ "incb x20\n"
+ "ld1w { z19.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
".inst 0x658aa983 // bfcvt z3.h, p2/M, z12.s\n"
".inst 0x658aaa62 // bfcvt z2.h, p2/M, z19.s\n"
- "ld1w { z24.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
".inst 0x648aab02 // bfcvtnt z2.h, p2/M, z24.s\n"
- "ld1w { z12.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "ld1w { z12.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x658aa980 // bfcvt z0.h, p2/M, z12.s\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z19.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z19.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
".inst 0x658aaa6a // bfcvt z10.h, p2/M, z19.s\n"
- "sub x19, x14, #0x1\n"
- "ld1w { z24.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- "orr x22, x19, %x[ld_in_col], LSL #18\n"
+ "sub x20, x15, #0x1\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
"mov z5.d, z4.d\n"
- "ld1w { z12.s }, p2/Z, [x20]\n"
- "orr x22, x16, x22, LSL #20\n"
- "mov x21, #0x9\n"
+ "ld1w { z12.s }, p2/Z, [x21]\n"
+ "orr x23, x17, x23, LSL #20\n"
+ "mov x22, #0x9\n"
"mov z6.d, z4.d\n"
- "add x20, x17, x7\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
"mov z7.d, z4.d\n"
".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
".inst 0x658aa981 // bfcvt z1.h, p2/M, z12.s\n"
"mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x22, x22, #0x2\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x17, x13\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x19, x13\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x7, x20, x14\n"
".inst 0xc0040c80 // mova za.d[x8, #0], { z4.d-z7.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040c81 // mova za.d[x8, #1], { z4.d-z7.d }\n"
- "mov x21, #0x2\n"
- "ldp x9, x28, [x10], #0x10\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x11], #0x10\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "ldp x27, x26, [x19], #0x10\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 5f\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x11], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "and x21, x20, #0x1\n"
- "add x20, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "lsr x20, x20, #0x1\n"
- "sub x11, x11, x20\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x13, x13, x21\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z28.s }, p1, [x9]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z29.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z30.s }, p1, [x25]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z31.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x17, x7\n"
+ "adds XZR, x7, x6\n"
"bne 10f\n"
- "cbz x21, 8f\n"
- "cmp x21, #0x1\n"
- "sub x14, x14, x21\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaace // bfcvt z14.h, p2/M, z22.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1331190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z3.h\n"
"7:" // Unpadded: 1 priming loads
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
- "8:" // Unpadded: 0 priming loads
- "cmp x14, #0x2\n"
- "blt 16f\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "sub x14, x14, #0x2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x11, x11, #0x1\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "lsr x19, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x19, x11\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "csel x21, x19, x11, LT\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "and x14, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "sub x11, x11, x21\n"
+ ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "cbz x21, 15f\n"
+ ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ "blt 16f\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "sub x15, x15, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "lsr x20, x15, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "csel x22, x20, x13, LT\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "and x15, x15, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "sub x13, x13, x22\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "cbz x22, 15f\n"
"9:" // Unpadded: Main loop
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"add x8, x8, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
+ "ld1w { z16.s }, p1/Z, [x14]\n"
".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x9]\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
- "add x9, x9, x27, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "st1w { z29.s }, p1, [x28]\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z30.s }, p1, [x25]\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z31.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa8b // bfcvtnt z11.h, p2/M, z20.s\n"
".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
- "cbz x21, 13f\n"
- "cmp x21, #0x1\n"
- "sub x14, x14, x21\n"
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1331190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z3.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
"13:" // Padded: 0 priming loads
- "cmp x14, #0x2\n"
+ "cmp x15, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "sub x14, x14, #0x2\n"
- "sub x11, x11, #0x1\n"
- "lsr x19, x14, #0x1\n"
- "cmp x19, x11\n"
- "csel x20, x19, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "and x14, x14, #0x1\n"
- "sub x11, x11, x20\n"
- "cbz x20, 15f\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "csel x21, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x21\n"
+ "cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x13]\n"
+ "ld1w { z18.s }, p0/Z, [x14]\n"
".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
"mov x12, #0x0\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z28.s }, p1, [x9]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "st1w { z29.s }, p1, [x28]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "st1w { z30.s }, p1, [x25]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
"add x8, x8, #0x1\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa2c // bfcvt z12.h, p2/M, z17.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x658aaa6d // bfcvt z13.h, p2/M, z19.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "subs x20, x20, #0x1\n"
- "add x9, x9, x27, LSL #2\n"
- "st1w { z31.s }, p1, [x24]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "add x28, x28, x26, LSL #2\n"
- "add x25, x25, x23, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
".inst 0x648aaaab // bfcvtnt z11.h, p2/M, z21.s\n"
".inst 0x648aaa8c // bfcvtnt z12.h, p2/M, z20.s\n"
- "add x24, x24, x22, LSL #2\n"
+ "add x25, x25, x23, LSL #2\n"
".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
"bgt 14b\n"
"15:" // Main loop tail
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x13]\n"
+ "ld1w { z17.s }, p0/Z, [x14]\n"
".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x9]\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z29.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "st1w { z30.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
- "st1w { z31.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
"16:" // Main loop skip tail
- "cbz x14, 17f\n" // Skip remainder inputs
+ "cbz x15, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x19, x13, %x[ld_in_row], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc13a1170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z10.h\n"
- "sub x11, x11, #0x1\n"
+ "sub x13, x13, #0x1\n"
".inst 0xc1311190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z1.h\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc1331191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z3.h\n"
"add x8, x8, #0x1\n"
- "st1w { z29.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z30.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "st1w { z31.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
- "cbz x11, 19f\n"
+ "cbz x13, 19f\n"
"18:" // Right padding loop
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x11, x11, #0x1\n"
+ "subs x13, x13, #0x1\n"
".inst 0xc1b7cb7c // fclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1w { z28.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "st1w { z29.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "st1w { z30.s }, p1, [x25]\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z31.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
"bgt 18b\n"
"19:" // End
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x19, ALL, MUL #9\n"
- "str x19, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x13, x13, x19, LSL #2\n"
- "str x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x10, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x10, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x14, x14, x20, LSL #2\n"
+ "str x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x11, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x11, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x0]\n"
- "ldp x22, x21, [x10, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x11, #0x0]\n"
+ "ldp x23, x22, [x11, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x10, #0x10]\n"
+ "stp x23, x22, [x11, #0x10]\n"
"b.any 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
index 17f2455469..2b3a247686 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,294 +69,258 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x8\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x6\n"
+ "sub x20, x20, x4\n"
"ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ld1rw { z26.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x7\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x6\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z31.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x5\n"
"addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z24.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z24.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z24.s }, p1/Z, [x20, x7, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x19\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x21, x20\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
"fmov z4.s, #0x0\n"
- "incb x19\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "incb x20\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "addvl x23, SP, #30\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "addvl x24, SP, #30\n"
".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "addvl x23, x23, #-6\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x23]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x24]\n"
".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
"fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "incb x19\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z6.h }, p2, [x23, #2, MUL VL]\n"
+ "incb x20\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "mov x20, x19\n"
- "st1h { z7.h }, p2, [x23, #3, MUL VL]\n"
+ "mov x21, x20\n"
+ "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x23, #4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "incb x19\n"
- "st1h { z9.h }, p2, [x23, #5, MUL VL]\n"
- "addvl x23, x23, #-6\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x23]\n"
+ "incb x20\n"
+ "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x24]\n"
"fmov z4.s, #0x0\n"
".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z5.h }, p2, [x23, #1, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- "st1h { z6.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- "st1h { z7.h }, p2, [x23, #3, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
+ "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x23, #4, MUL VL]\n"
- "st1h { z9.h }, p2, [x23, #5, MUL VL]\n"
- "addvl x23, x23, #-6\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x23]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x24]\n"
".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
"fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x23, #1, MUL VL]\n"
- "incb x20, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
- "st1h { z6.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "incb x19\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "incb x20\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "st1h { z7.h }, p2, [x23, #3, MUL VL]\n"
+ "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z8.h }, p2, [x23, #4, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "st1h { z9.h }, p2, [x23, #5, MUL VL]\n"
- "addvl x23, x23, #-6\n"
- "st1h { z4.h }, p2, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "st1h { z4.h }, p2, [x24]\n"
".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
- "incb x20, ALL, MUL #5\n"
+ "incb x21, ALL, MUL #5\n"
"fmov z4.s, #0x0\n"
- "st1h { z5.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "sub x19, x16, #0x1\n"
- "st1h { z6.h }, p2, [x23, #2, MUL VL]\n"
+ "sub x20, x17, #0x1\n"
+ "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "orr x22, x19, %x[ld_in_col], LSL #18\n"
- "st1h { z7.h }, p2, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
".inst 0x658aa867 // bfcvt z7.h, p2/M, z3.s\n"
- "orr x22, x7, x22, LSL #20\n"
- "mov x21, #0x8\n"
- "st1h { z8.h }, p2, [x23, #4, MUL VL]\n"
+ "orr x23, x6, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
".inst 0x658aaa28 // bfcvt z8.h, p2/M, z17.s\n"
- "add x20, x5, x6\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "st1h { z9.h }, p2, [x23, #5, MUL VL]\n"
- "addvl x23, x23, #-6\n"
+ "add x21, x5, x4\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
"mov z25.d, z24.d\n"
".inst 0x648aaa44 // bfcvtnt z4.h, p2/M, z18.s\n"
- "st1h { z4.h }, p2, [x23]\n"
+ "st1h { z4.h }, p2, [x24]\n"
".inst 0x648aa965 // bfcvtnt z5.h, p2/M, z11.s\n"
".inst 0x648aa866 // bfcvtnt z6.h, p2/M, z3.s\n"
"mov x11, #0x0\n"
- "st1h { z5.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
".inst 0x648aaa27 // bfcvtnt z7.h, p2/M, z17.s\n"
".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
"mov x8, #0x8\n"
- "st1h { z6.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #2, MUL VL]\n"
".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ldr x14, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x22, x22, #0x2\n"
- "st1h { z7.h }, p2, [x23, #3, MUL VL]\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x5, x15\n"
- "st1h { z8.h }, p2, [x23, #4, MUL VL]\n"
- "st1h { z9.h }, p2, [x23, #5, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "st1h { z7.h }, p2, [x24, #3, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x5, x16\n"
+ "st1h { z8.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z9.h }, p2, [x24, #5, MUL VL]\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x15, x5, x19, x15\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x16, x5, x20, x16\n"
".inst 0xc0046b00 // mova za.d[x11, #0], { z24.d-z25.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0046b01 // mova za.d[x11, #1], { z24.d-z25.d }\n"
- "mov x21, #0x4\n"
- "ldp x13, x0, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x25], #0x10\n"
".inst 0xc0046b02 // mova za.d[x11, #2], { z24.d-z25.d }\n"
- "ldp x10, x9, [x19], #0x10\n"
+ "ldp x0, x10, [x20], #0x10\n"
".inst 0xc0046b03 // mova za.d[x11, #3], { z24.d-z25.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0046b04 // mova za.d[x11, #4], { z24.d-z25.d }\n"
- "ldp x28, x27, [x24], #0x10\n"
+ "ldp x9, x28, [x25], #0x10\n"
".inst 0xc0046b05 // mova za.d[x11, #5], { z24.d-z25.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
".inst 0xc0046b06 // mova za.d[x11, #6], { z24.d-z25.d }\n"
".inst 0xc0046b07 // mova za.d[x11, #7], { z24.d-z25.d }\n"
".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- "cbz x20, 5f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 5f\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- "sub x14, x14, x20\n"
+ "sub x15, x15, x21\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z0.s }, p1, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x0, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z2.s }, p1, [x0]\n"
- "add x0, x0, x9, LSL #2\n"
- "st1w { z1.s }, p1, [x28]\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z3.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x5, x6\n"
+ "adds XZR, x5, x4\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 9f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 8f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
- "add x20, x15, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x15]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "addvl x19, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- "7:" // Unpadded: 3 priming loads
- "add x21, x15, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x15]\n"
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
@@ -372,40 +336,30 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
+ ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- "8:" // Unpadded: 2 priming loads
- "add x22, x15, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x15]\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
@@ -425,46 +379,36 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"ld1w { z16.s }, p1/Z, [x22]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- "9:" // Unpadded: 1 priming loads
- "add x23, x15, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x15]\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "addvl x22, SP, #6\n"
+ "addvl x22, SP, #12\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
@@ -487,121 +431,177 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
+ "ld1w { z16.s }, p1/Z, [x24]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
"10:" // Unpadded: 0 priming loads
".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 20f\n"
- "add x19, x15, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x15]\n"
+ "cbz x17, 20f\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "sub x16, x16, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "sub x14, x14, #0x1\n"
+ "sub x17, x17, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, #0x1\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "cmp x16, x14\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "csel x24, x16, x14, LT\n"
+ "cmp x17, x15\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x25, x17, x15, LT\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "sub x14, x14, x24\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, x25\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "cbz x24, 19f\n"
+ "cbz x25, 19f\n"
"11:" // Unpadded: Main loop
- "addvl x23, SP, #6\n"
+ "addvl x24, SP, #6\n"
".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "addvl x22, SP, #12\n"
- "ld1w { z23.s }, p1/Z, [x15]\n"
+ "addvl x23, SP, #12\n"
+ "ld1w { z23.s }, p1/Z, [x16]\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
- "addvl x21, SP, #18\n"
- "addvl x20, SP, #24\n"
+ ".inst 0xa0402b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- "add x19, x15, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- "subs x24, x24, #0x1\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa0412b06 // ld1h { z6.h-z7.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- "ld1w { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0422b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
".inst 0x658aaaec // bfcvt z12.h, p2/M, z23.s\n"
@@ -622,80 +622,35 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x13]\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x0, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z2.s }, p1, [x0]\n"
- "add x0, x0, x9, LSL #2\n"
"add x11, x11, #0x2\n"
".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "st1w { z3.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
- "cbz x21, 17f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 16f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 15f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x15]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x20, x15, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x19, SP, #24\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- "14:" // Padded: 3 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x15]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x21, x15, %x[ld_in_row], LSL #2\n"
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
@@ -714,7 +669,7 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
@@ -722,35 +677,25 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- "addvl x19, SP, #24\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- "15:" // Padded: 2 priming loads
+ "14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x15]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x22, x15, %x[ld_in_row], LSL #2\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
@@ -769,7 +714,7 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
@@ -777,45 +722,35 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
- ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- "16:" // Padded: 1 priming loads
+ "15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x15]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x23, x15, %x[ld_in_row], LSL #2\n"
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
@@ -834,7 +769,7 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x22, SP, #6\n"
+ "addvl x22, SP, #12\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
@@ -842,17 +777,16 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- "addvl x19, SP, #24\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
@@ -863,141 +797,207 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "addvl x22, SP, #12\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
+ ".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
+ ".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
"17:" // Padded: 0 priming loads
".inst 0xa0402be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be6 // ld1h { z6.h-z7.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be8 // ld1h { z8.h-z9.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 20f\n"
+ "cbz x17, 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x15]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x19, x15, %x[ld_in_row], LSL #2\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "sub x16, x16, #0x1\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x17, x17, #0x1\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "sub x14, x14, #0x1\n"
- "cmp x16, x14\n"
- "csel x24, x16, x14, LT\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
- "sub x14, x14, x24\n"
- "cbz x24, 19f\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x17, x15\n"
+ "csel x25, x17, x15, LT\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 19f\n"
"18:" // Padded: Main loop
- "addvl x23, SP, #6\n"
+ "addvl x24, SP, #6\n"
".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "addvl x22, SP, #12\n"
+ "addvl x23, SP, #12\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa0402b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
- "ld1w { z23.s }, p0/Z, [x15]\n"
- "add x21, x15, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p0/Z, [x16]\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- "subs x24, x24, #0x1\n"
- "add x15, x15, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0412b06 // ld1h { z6.h-z7.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
+ "ld1w { z21.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0422b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
- "ld1w { z18.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
- "ld1w { z17.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
".inst 0x658aaaec // bfcvt z12.h, p2/M, z23.s\n"
@@ -1018,60 +1018,60 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x13]\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x0, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z2.s }, p1, [x0]\n"
- "add x0, x0, x9, LSL #2\n"
"add x11, x11, #0x2\n"
".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "st1w { z3.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
"bgt 18b\n"
"19:" // Main loop tail
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1257190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z5.h\n"
- "addvl x21, SP, #12\n"
+ "addvl x22, SP, #12\n"
".inst 0xc1247191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa0402ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
".inst 0xc1257192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z5.h\n"
".inst 0xc1247193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
".inst 0xc12771b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc1257194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z5.h\n"
".inst 0xc1247195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc12971d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc1257196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z5.h\n"
".inst 0xc1247197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z4.h\n"
- ".inst 0xa0402a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
".inst 0xc12771b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc12971d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc12771b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z7.h\n"
".inst 0xc12671b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412a66 // ld1h { z6.h-z7.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc12971d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc12971d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z9.h\n"
".inst 0xc12871d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z8.h\n"
- ".inst 0xa0422a68 // ld1h { z8.h-z9.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1251190 // bfdot za.s[x8, 0], { z12.h-z13.h }, z5.h\n"
".inst 0xc1241191 // bfdot za.s[x8, 1], { z12.h-z13.h }, z4.h\n"
".inst 0xc12711b0 // bfdot za.s[x8, 0], { z13.h-z14.h }, z7.h\n"
@@ -1082,66 +1082,66 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x13]\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x0, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z2.s }, p1, [x0]\n"
- "add x0, x0, x9, LSL #2\n"
"add x11, x11, #0x2\n"
".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- "st1w { z3.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
"20:" // Main loop skip tail
- "cbz x14, 22f\n"
+ "cbz x15, 22f\n"
"21:" // Right padding loop
".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
- "subs x14, x14, #0x1\n"
+ "subs x15, x15, #0x1\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
".inst 0xc1bfcb40 // fclamp { z0.s-z3.s }, z26.s, z31.s\n"
- "st1w { z0.s }, p1, [x13]\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x0, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z2.s }, p1, [x0]\n"
- "add x0, x0, x9, LSL #2\n"
"add x11, x11, #0x2\n"
".inst 0xc0040b00 // mova za.d[x8, #0], { z24.d-z25.d }\n"
- "st1w { z1.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc0040b01 // mova za.d[x8, #1], { z24.d-z25.d }\n"
- "st1w { z3.s }, p1, [x27]\n"
- "add x27, x27, x25, LSL #2\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
"bgt 21b\n"
"22:" // End
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x19, ALL, MUL #16\n"
- "incb x19, ALL, MUL #9\n"
- "str x19, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x15, x15, x19, LSL #2\n"
- "str x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x7\n"
+ "whilelt p1.s, x7, x6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x16, x16, x20, LSL #2\n"
+ "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x0", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
index eae8994166..01f689a0b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,292 +69,211 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0xb\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x4\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x20, x3\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ld1rw { z30.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
- "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x6\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z22.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x5\n"
+ "whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
- "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
- "ldr x19, [%x[args], %[offsetof_Args_bias]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z4.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z4.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x19\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x21, x20\n"
+ "ld1w { z31.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x19\n"
- "ld1w { z13.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "incb x20\n"
+ "ld1w { z13.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "addvl x23, SP, #15\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "addvl x24, SP, #15\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "addvl x23, x23, #-3\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
- "st1h { z1.h }, p2, [x23]\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
+ "st1h { z1.h }, p2, [x24]\n"
".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z31.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x19\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z9.h }, p2, [x23, #1, MUL VL]\n"
+ "incb x20\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "ld1w { z13.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z13.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x23, #2, MUL VL]\n"
- "addvl x23, x23, #-3\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
- "st1h { z1.h }, p2, [x23]\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
+ "st1h { z1.h }, p2, [x24]\n"
".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "incb x19\n"
+ "ld1w { z31.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "incb x20\n"
".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "st1h { z9.h }, p2, [x23, #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "st1h { z2.h }, p2, [x23, #2, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "addvl x23, x23, #-3\n"
+ "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "addvl x24, x24, #-3\n"
".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
- "st1h { z1.h }, p2, [x23]\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
+ "st1h { z1.h }, p2, [x24]\n"
".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z31.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
- "incb x19\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "incb x20\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "st1h { z9.h }, p2, [x23, #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z2.h }, p2, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
- "mov x20, x19\n"
- "addvl x23, x23, #-3\n"
- "st1h { z1.h }, p2, [x23]\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "mov x21, x20\n"
+ "addvl x24, x24, #-3\n"
+ "st1h { z1.h }, p2, [x24]\n"
+ "ld1w { z31.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "st1h { z9.h }, p2, [x23, #1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aabe1 // bfcvt z1.h, p2/M, z31.s\n"
".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
- "ld1w { z13.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
+ "ld1w { z13.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
".inst 0x658aa9a9 // bfcvt z9.h, p2/M, z13.s\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- "sub x19, x17, #0x1\n"
- "st1h { z2.h }, p2, [x23, #2, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
- "orr x22, x19, %x[ld_in_col], LSL #18\n"
- "addvl x23, x23, #-3\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z18.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "sub x20, x7, #0x1\n"
+ "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "addvl x24, x24, #-3\n"
"mov z5.d, z4.d\n"
- "orr x22, x6, x22, LSL #20\n"
- "mov x21, #0xb\n"
+ "orr x23, x5, x23, LSL #20\n"
+ "mov x22, #0xb\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "add x20, x5, x4\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
- "st1h { z1.h }, p2, [x23]\n"
+ "st1h { z1.h }, p2, [x24]\n"
".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "st1h { z9.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
"mov x8, #0x0\n"
- "st1h { z2.h }, p2, [x23, #2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x22, x22, #0x2\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x5, x16\n"
+ "st1h { z2.h }, p2, [x24, #2, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
"3:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x2\n"
- "msub x16, x5, x19, x16\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x17, x4, x20, x17\n"
".inst 0xc0040c80 // mova za.d[x8, #0], { z4.d-z7.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040c81 // mova za.d[x8, #1], { z4.d-z7.d }\n"
- "mov x21, #0x4\n"
- "ldp x14, x13, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x25], #0x10\n"
".inst 0xc0040c82 // mova za.d[x8, #2], { z4.d-z7.d }\n"
- "ldp x11, x10, [x19], #0x10\n"
+ "ldp x13, x11, [x20], #0x10\n"
".inst 0xc0040c83 // mova za.d[x8, #3], { z4.d-z7.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "ldp x9, x28, [x24], #0x10\n"
- "ldp x27, x26, [x19], #0x10\n"
- "cbz x20, 5f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 5f\n"
+ "ldp x10, x9, [x25], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "and x21, x20, #0x1\n"
- "add x20, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "lsr x20, x20, #0x1\n"
- "sub x15, x15, x20\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x16, x16, x21\n"
"4:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1w { z24.s }, p1, [x14]\n"
+ "subs x21, x21, #0x1\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "st1w { z25.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z25.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
- "st1w { z26.s }, p1, [x9]\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z27.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x5, x4\n"
+ "adds XZR, x4, x3\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 9f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 8f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
- "add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x16]\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x17]\n"
".inst 0x658aaaea // bfcvt z10.h, p2/M, z23.s\n"
- "addvl x19, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "7:" // Unpadded: 3 priming loads
- "add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "addvl x19, SP, #9\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "8:" // Unpadded: 2 priming loads
- "add x21, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "addvl x20, SP, #6\n"
+ "addvl x20, SP, #12\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "addvl x19, SP, #12\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
@@ -376,31 +295,25 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
"ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "9:" // Unpadded: 1 priming loads
- "add x21, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "addvl x20, SP, #3\n"
+ "addvl x20, SP, #9\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "addvl x19, SP, #9\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
@@ -422,169 +335,256 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x17]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "addvl x21, SP, #6\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
"ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x17]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "addvl x21, SP, #3\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ ".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
- "cmp x17, #0x2\n"
+ "cmp x7, #0x2\n"
".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
"ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
- "add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "sub x17, x17, #0x2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "sub x15, x15, #0x1\n"
+ "sub x7, x7, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x16, x16, #0x1\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "lsr x19, x17, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "cmp x19, x15\n"
+ "lsr x20, x7, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x16\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "csel x25, x19, x15, LT\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x26, x20, x16, LT\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "and x17, x17, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "and x7, x7, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "sub x15, x15, x25\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x16, x16, x26\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "cbz x25, 19f\n"
+ "cbz x26, 19f\n"
"11:" // Unpadded: Main loop
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
- "ld1w { z18.s }, p1/Z, [x16]\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ "ld1w { z18.s }, p1/Z, [x17]\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
- "add x22, x16, %x[ld_in_row], LSL #2\n"
- "addvl x21, SP, #3\n"
+ ".inst 0xa1402b21 // ld1h { z1.h, z9.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row], LSL #2\n"
+ "addvl x22, SP, #3\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z17.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
- "addvl x20, SP, #9\n"
- "add x19, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ "addvl x21, SP, #9\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x25, #2, MUL VL]\n"
".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
- "subs x25, x25, #0x1\n"
- "ld1w { z17.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "subs x26, x26, #0x1\n"
+ "ld1w { z17.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
"add x8, x8, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "st1w { z24.s }, p1, [x14]\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x14, x14, x11, LSL #2\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "st1w { z25.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "st1w { z26.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x16]\n"
+ "ld1w { z16.s }, p1/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "st1w { z27.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1w { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
- "ld1w { z16.s }, p1/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
@@ -592,434 +592,434 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
- "cbz x21, 17f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 16f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 15f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "addvl x19, SP, #12\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "addvl x19, SP, #9\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "addvl x20, SP, #6\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "addvl x19, SP, #12\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "addvl x20, SP, #12\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "addvl x20, SP, #3\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #3\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "addvl x19, SP, #9\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "addvl x20, SP, #9\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
"17:" // Padded: 0 priming loads
- "cmp x17, #0x2\n"
+ "cmp x7, #0x2\n"
".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
"ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x19, x16, %x[ld_in_row], LSL #2\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "sub x17, x17, #0x2\n"
- "sub x15, x15, #0x1\n"
- "lsr x19, x17, #0x1\n"
- "cmp x19, x15\n"
- "csel x23, x19, x15, LT\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "and x17, x17, #0x1\n"
- "sub x15, x15, x23\n"
- "cbz x23, 19f\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "csel x24, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x24\n"
+ "cbz x24, 19f\n"
"18:" // Padded: Main loop
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "addvl x22, SP, #6\n"
- "addvl x20, SP, #12\n"
+ "addvl x23, SP, #6\n"
+ "addvl x21, SP, #12\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
- "add x19, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "addvl x21, SP, #3\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x22, SP, #3\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
"mov x12, #0x4\n"
- "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "addvl x20, SP, #9\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "addvl x21, SP, #9\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
- "add x19, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z21.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
"mov x12, #0x4\n"
- "ld1w { z20.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "subs x23, x23, #0x1\n"
- "ld1w { z19.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "subs x24, x24, #0x1\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x14]\n"
+ "st1w { z24.s }, p1, [x15]\n"
"mov x12, #0x8\n"
- "ld1w { z18.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
- "st1w { z25.s }, p1, [x13]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z25.s }, p1, [x14]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
- "ld1w { z17.s }, p0/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x19]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x15, x15, x13, LSL #2\n"
"add x14, x14, x11, LSL #2\n"
- "add x13, x13, x10, LSL #2\n"
- "st1w { z26.s }, p1, [x9]\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z27.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
".inst 0x648aaaaa // bfcvtnt z10.h, p2/M, z21.s\n"
".inst 0x648aaa8b // bfcvtnt z11.h, p2/M, z20.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
"ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
".inst 0x648aaa6c // bfcvtnt z12.h, p2/M, z19.s\n"
".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
@@ -1028,215 +1028,215 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"bgt 18b\n"
"19:" // Main loop tail
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
- "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #3\n"
- "addvl x19, SP, #9\n"
+ ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #3\n"
+ "addvl x20, SP, #9\n"
".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
"mov x12, #0x4\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
- "ld1w { z18.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa4c // bfcvtnt z12.h, p2/M, z18.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- "st1w { z24.s }, p1, [x14]\n"
- "add x14, x14, x11, LSL #2\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
- "st1w { z25.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
- "st1w { z26.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z27.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
"ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
"20:" // Main loop skip tail
- "cbz x17, 21f\n" // Skip remainder inputs
+ "cbz x7, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
"mov x12, #0x4\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0d // bfcvtnt z13.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x648aaa0e // bfcvtnt z14.h, p2/M, z16.s\n"
".inst 0xc1311150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
- "addvl x20, SP, #6\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
".inst 0xc1391170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x19, SP, #12\n"
+ "addvl x20, SP, #12\n"
".inst 0xc1311151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "sub x15, x15, #0x1\n"
+ "sub x16, x16, #0x1\n"
".inst 0xc1391171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z9.h\n"
- ".inst 0xa1402a61 // ld1h { z1.h, z9.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
".inst 0xc1311152 // bfdot za.s[x8, 2], { z10.h-z13.h }, z1.h\n"
".inst 0xc1321190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1391172 // bfdot za.s[x8, 2], { z11.h-z14.h }, z9.h\n"
".inst 0xc1321191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z2.h\n"
- "ld1h { z2.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "st1w { z24.s }, p1, [x14]\n"
- "add x14, x14, x11, LSL #2\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
".inst 0xc1321192 // bfdot za.s[x8, 2], { z12.h-z15.h }, z2.h\n"
"add x8, x8, #0x1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
- "st1w { z26.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z27.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
"21:" // Tail input: End
- "cbz x15, 23f\n"
+ "cbz x16, 23f\n"
"22:" // Right padding loop
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "subs x15, x15, #0x1\n"
+ "subs x16, x16, #0x1\n"
".inst 0xc1b6cbd8 // fclamp { z24.s-z27.s }, z30.s, z22.s\n"
- "st1w { z24.s }, p1, [x14]\n"
- "add x14, x14, x11, LSL #2\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
".inst 0xc0040c84 // mova za.d[x8, #4], { z4.d-z7.d }\n"
- "st1w { z25.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
- "st1w { z26.s }, p1, [x9]\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z27.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
"bgt 22b\n"
"23:" // End
- "ldr x19, [%x[args], %[offsetof_Args_weights]]\n"
- "incb x19, ALL, MUL #16\n"
- "incb x19, ALL, MUL #9\n"
- "str x19, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x7\n"
- "whilelt p1.s, x7, x6\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x19, LSL #2\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x17, x17, x20, LSL #2\n"
+ "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
"add x22, x22, x20, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 7fee92ba29..6c42c76683 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,18 +69,18 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x6\n"
- "ldr x8, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x7\n"
+ "sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x8\n"
+ "whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z24.h, p2/M, z24.h\n"
@@ -90,377 +90,377 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z8.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z8.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1sb { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z20.h, #0x0\n"
"sub z27.h, z27.h, z21.h\n"
- "incw x21\n"
- "ld1sb { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "incw x22\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z23.h, z23.h, z21.h\n"
"trn1 z0.h, z20.h, z27.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z21.h\n"
- "mov x19, x21\n"
+ "mov x20, x22\n"
"trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"trn1 z2.h, z23.h, z16.h\n"
"trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z27.h, z27.h, z21.h\n"
"sub z23.h, z23.h, z21.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z21.h\n"
- "addvl x20, SP, #12\n"
- "incw x21\n"
- "addvl x20, x20, #-4\n"
- "mov x19, x21\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z0.h }, p2, [x21]\n"
"trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z2.h, z23.h, z16.h\n"
- "ld1sb { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"sub z27.h, z27.h, z21.h\n"
"sub z23.h, z23.h, z21.h\n"
- "addvl x20, x20, #-4\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z16.h, z16.h, z21.h\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
"mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z0.h, z20.h, z27.h\n"
"trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
- "addvl x20, x20, #-4\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
"trn1 z2.h, z23.h, z16.h\n"
"trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x20]\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z10.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "st1h { z0.h }, p2, [x21]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z11.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x17, x22, LSL #22\n"
- "mov x21, #0x6\n"
- "add x20, x8, x7\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "mov x11, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x8, x14\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x14, x8, x19, x14\n"
- ".inst 0xc0046900 // mova za.d[x11, #0], { z8.d-z9.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046901 // mova za.d[x11, #1], { z8.d-z9.d }\n"
- "mov x21, #0x2\n"
- "ldp x10, x9, [x24], #0x10\n"
- ".inst 0xc0046902 // mova za.d[x11, #2], { z8.d-z9.d }\n"
- "ldp x28, x27, [x19], #0x10\n"
- ".inst 0xc0046903 // mova za.d[x11, #3], { z8.d-z9.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- "ldp x26, x25, [x24], #0x10\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
- "ldp x24, x23, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "sub x13, x13, x20\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x25], #0x10\n"
+ ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ "ldp x27, x26, [x25], #0x10\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x8, x7\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #8\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- "9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
"ld1sb { z17.s }, p1/Z, [x14]\n"
- "addvl x20, SP, #4\n"
+ "addvl x20, SP, #8\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z17.h, z16.h\n"
"add z13.h, z13.h, z24.h\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #8\n"
+ "add x14, x14, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z17.h, z16.h\n"
"add z14.h, z14.h, z24.h\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x14, x14, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"trn1 z15.h, z17.h, z16.h\n"
"add z15.h, z15.h, z24.h\n"
".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z17.h, z16.h\n"
+ "add z13.h, z13.h, z24.h\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z17.h, z16.h\n"
+ "add z14.h, z14.h, z24.h\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "trn1 z15.h, z17.h, z16.h\n"
+ "add z15.h, z15.h, z24.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
"ld1sb { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z17.h, z16.h\n"
"sub x13, x13, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
"add z13.h, z13.h, z24.h\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z17.h, z16.h\n"
- "csel x22, x15, x13, LT\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z14.h, z14.h, z24.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"trn1 z15.h, z17.h, z16.h\n"
"add z15.h, z15.h, z24.h\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- "addvl x21, SP, #4\n"
- "addvl x20, SP, #8\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
"ld1sb { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "add x19, x14, %x[ld_in_row]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- "ld1sb { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "ld1sb { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- "ld1sb { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
"trn1 z13.h, z21.h, z20.h\n"
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
"add z13.h, z13.h, z24.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
"trn1 z14.h, z19.h, z18.h\n"
"trn1 z15.h, z17.h, z16.h\n"
- "add x11, x11, #0x2\n"
+ "add x8, x8, #0x2\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
"add z15.h, z15.h, z24.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
- "cbz x21, 15f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #8\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
"add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #4\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
"add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #8\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
"trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
"15:" // Padded: 0 priming loads
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
@@ -469,192 +469,192 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
"trn1 z15.h, z17.h, z16.h\n"
- "csel x22, x15, x13, LT\n"
+ "csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
"add z21.h, p0/M, z21.h, z24.h\n"
- "add x21, x14, %x[ld_in_row]\n"
+ "add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1sb { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
"add z20.h, p0/M, z20.h, z24.h\n"
- "add x21, x21, %x[ld_in_row]\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
- "addvl x20, SP, #4\n"
+ "addvl x21, SP, #4\n"
"add z18.h, p0/M, z18.h, z24.h\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #8\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- "subs x22, x22, #0x1\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x10]\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
"trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
- "add x11, x11, #0x2\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"trn1 z14.h, z19.h, z18.h\n"
"trn1 z15.h, z17.h, z16.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- "addvl x20, SP, #4\n"
- "addvl x19, SP, #8\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x26]\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
- "add x11, x11, #0x2\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- "add x11, x11, #0x2\n"
+ "add x8, x8, #0x2\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 19b\n"
"20:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x19\n"
+ "add x14, x14, x20\n"
"str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #12\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_2rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_2rows_dot_za/generic.cpp
deleted file mode 100644
index a9538acf88..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_2rows_dot_za/generic.cpp
+++ /dev/null
@@ -1,592 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
-#include <algorithm>
-#include <cstddef>
-#include "arm_gemm.hpp"
-
-using arm_gemm::Requantize32;
-
-namespace arm_conv {
-namespace depthwise {
-
-void sme2_s8q_planar_3x3_s2_2rows_dot_za_impl(
- const int8_t *inptr,
- size_t ld_in_row,
- size_t ld_in_col,
- unsigned int pad_top,
- unsigned int valid_input_rows,
- unsigned int pad_left,
- unsigned int valid_input_cols,
- const int8_t *weights,
- int8_t **outptrs,
- const size_t *outlds,
- unsigned int output_cols,
- unsigned int start_channel,
- unsigned int valid_channels,
- const arm_gemm::Requantize32 &qp
-)
-{
- struct Args
- {
- const int8_t *inptr;
- long unsigned int pad_top, pad_bottom, pad_left;
- const int8_t *weights;
- long unsigned int input_cols, output_cols;
- int8_t **outptrs;
- const size_t *ld_out_cols;
- long unsigned int n, n_channels;
- };
-
- Args args = { inptr, pad_top, 5u - std::min(5u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, start_channel, valid_channels };
-
- __asm__ __volatile__(
- "ldr x11, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x5\n"
- ".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x11\n"
- "ldr x10, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ptrue p0.b\n"
- "mov z12.s, #0x0\n"
- "ldr x22, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p5.s, XZR, x22\n"
- "whilelt p9.s, XZR, x19\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "whilelt p8.s, XZR, x10\n"
- "eor p8.b, p0/Z, p8.b, p9.b\n"
- "ldr x21, [%x[args], %[offsetof_Args_n]]\n"
- "cbz x19, 1f\n"
- "ld1w { z12.s }, p5/Z, [x19, x21, LSL #2]\n"
- "1:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "mov z0.h, #0x0\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "mov z13.d, z12.d\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "ld1rh { z28.h }, p0/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z27.h, z27.h, z28.h\n"
- "sub z16.h, z16.h, z28.h\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z21.h, z21.h, z28.h\n"
- "trn1 z8.h, z27.h, z21.h\n"
- "sub z20.h, z20.h, z28.h\n"
- "sub z18.h, z18.h, z28.h\n"
- "trn1 z7.h, z16.h, z20.h\n"
- "sub z17.h, z17.h, z28.h\n"
- "sub z24.h, z24.h, z28.h\n"
- "trn1 z6.h, z17.h, z0.h\n"
- "sub z19.h, z19.h, z28.h\n"
- "trn1 z5.h, z24.h, z0.h\n"
- "trn1 z4.h, z22.h, z18.h\n"
- "trn1 z3.h, z19.h, z0.h\n"
- "ld1rh { z21.h }, p0/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "ld1rw { z2.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z1.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "cbz x19, 2f\n"
- "ld1w { z1.s }, p5/Z, [x19, x21, LSL #2]\n"
- "2:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "ld1rw { z0.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "cbz x19, 3f\n"
- "ld1w { z0.s }, p5/Z, [x19, x21, LSL #2]\n"
- "3:" // Load right_shift: End
- "ldr x28, [%x[args], %[offsetof_Args_input_cols]]\n"
- "orr x21, x28, %x[ld_in_col], LSL #16\n"
- "orr x21, x22, x21, LSL #22\n"
- "ld1rw { z20.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ldr x27, [%x[args], %[offsetof_Args_inptr]]\n"
- "mov x20, #0x5\n"
- "add x19, x10, x11\n"
- "ld1rw { z19.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov x9, #0x0\n"
- "ldr x26, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x21, x21, #0x0\n"
- "sub x20, x20, x19\n"
- "mov x19, x27\n"
- "4:" // Issue prefetches
- "subs x20, x20, #0x1\n"
- ".inst 0xf8b54a7c // rprfm pldstrm, x21, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
- "bgt 4b\n"
- "ldr x21, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x27, x10, x19, x27\n"
- ".inst 0xc0042980 // mova za.d[x9, #0], { z12.d-z13.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0042981 // mova za.d[x9, #1], { z12.d-z13.d }\n"
- "mov x25, #0x2\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldp x22, x21, [x19], #0x10\n"
- "cbz x20, 6f\n"
- "cmp x20, x25\n"
- "csel x19, x20, x25, LT\n"
- "sub x20, x20, x19\n"
- "sub x25, x25, x19\n"
- "cbz x20, 6f\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "and x25, x20, #0x1\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- "sub x26, x26, x20\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "5:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "bgt 5b\n"
- "6:" // Left padding: End
- "adds XZR, x10, x11\n"
- "bne 11f\n"
- "cbz x25, 9f\n"
- "cmp x25, #0x1\n"
- "sub x28, x28, x25\n"
- "beq 8f\n"
- "7:" // Unpadded: 2 priming loads
- "add x19, x27, %x[ld_in_row]\n"
- "ld1sb { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1sb { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1sb { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "ld1sb { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16835c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z8.h\n"
- ".inst 0xc16635e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z6.h\n"
- "8:" // Unpadded: 1 priming loads
- "add x19, x27, %x[ld_in_row]\n"
- "ld1sb { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1sb { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1sb { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "ld1sb { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "9:" // Unpadded: 0 priming loads
- "add x20, x27, %x[ld_in_row]\n"
- "ld1sb { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "sub x28, x28, #0x2\n"
- "ld1sb { z18.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "sub x26, x26, #0x1\n"
- "ld1sb { z15.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "lsr x19, x28, #0x1\n"
- "ld1sb { z17.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "cmp x19, x26\n"
- "ld1sb { z16.s }, p5/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
- "csel x20, x19, x26, LT\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "add x27, x27, %x[ld_in_col]\n"
- "and x28, x28, #0x1\n"
- "sub x26, x26, x20\n"
- "cbz x20, 16f\n"
- "10:" // Unpadded: Main loop
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- "subs x20, x20, #0x1\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "ld1sb { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- "sub z18.h, z18.h, z21.h\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "ld1sb { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1sb { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1sb { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "add x9, x9, #0x1\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "ld1sb { z14.s }, p5/Z, [x27]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "ld1sb { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- "ld1sb { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z14.h, z14.h, z21.h\n"
- "sub z18.h, z18.h, z21.h\n"
- "ld1sb { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "sub z17.h, z17.h, z21.h\n"
- "ld1sb { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "add x27, x27, %x[ld_in_col]\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "bgt 10b\n"
- "b 16f\n"
- "11:" // Padded
- "cbz x25, 14f\n"
- "cmp x25, #0x1\n"
- "sub x28, x28, x25\n"
- "beq 13f\n"
- "12:" // Padded: 2 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16835c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z8.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc16635e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z6.h\n"
- "13:" // Padded: 1 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "14:" // Padded: 0 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "sub x28, x28, #0x2\n"
- "sub x26, x26, #0x1\n"
- "lsr x19, x28, #0x1\n"
- "mov z16.d, z16.d\n"
- "cmp x19, x26\n"
- "csel x20, x19, x26, LT\n"
- "add x27, x27, %x[ld_in_col]\n"
- "and x28, x28, #0x1\n"
- "sub x26, x26, x20\n"
- "cbz x20, 16f\n"
- "15:" // Padded: Main loop
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "add x9, x9, #0x1\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "add x19, x27, %x[ld_in_row]\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "subs x20, x20, #0x1\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "add x27, x27, %x[ld_in_col]\n"
- "bgt 15b\n"
- "16:" // Main loop tail
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "mov x12, #0x4\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "add x9, x9, #0x1\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "cbz x28, 17f\n" // Skip remainder inputs
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "sub x26, x26, #0x1\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "add x9, x9, #0x1\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "17:" // Tail input: End
- "cbz x26, 19f\n"
- "18:" // Right padding loop
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "add x9, x9, #0x1\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "subs x26, x26, #0x1\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "bgt 18b\n"
- "19:" // End
- ".inst 0xd503467f // SMSTOP\n"
- :
- : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_n] "I" (offsetof(Args, n)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace depthwise
-} // namespace arm_conv
-
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index fd35da4010..03575aa799 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,18 +69,18 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x9\n"
- "ldr x8, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x7\n"
+ "sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x8\n"
+ "whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z5.h, p2/M, z5.h\n"
@@ -90,317 +90,317 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z0.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z0.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"sub z24.h, z24.h, z13.h\n"
- "incw x21\n"
+ "incw x22\n"
"mov z17.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z25.h, z25.h, z13.h\n"
"trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z13.h\n"
- "mov x19, x21\n"
+ "mov x20, x22\n"
"trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z24.h, z24.h, z13.h\n"
- "addvl x20, SP, #6\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z25.h, z25.h, z13.h\n"
- "incw x21\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z13.h\n"
- "addvl x20, x20, #-2\n"
- "mov x19, x21\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z10.h }, p2, [x21]\n"
"trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z24.h, z24.h, z13.h\n"
"sub z25.h, z25.h, z13.h\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"sub z16.h, z16.h, z13.h\n"
- "addvl x20, x20, #-2\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z10.h }, p2, [x21]\n"
"mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
- "addvl x20, x20, #-2\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "st1h { z10.h }, p2, [x21]\n"
"trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z8.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z7.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x17, x22, LSL #22\n"
- "mov x21, #0x9\n"
- "add x20, x8, x7\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "mov x11, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x8, x14\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x14, x8, x19, x14\n"
- ".inst 0xc0046c00 // mova za.d[x11, #0], { z0.d-z3.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046c01 // mova za.d[x11, #1], { z0.d-z3.d }\n"
- "mov x21, #0x2\n"
- "ldp x10, x9, [x24], #0x10\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "ldp x28, x27, [x19], #0x10\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x24], #0x10\n"
- "ldp x24, x23, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x25], #0x10\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x25], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "and x21, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "sub x13, x13, x20\n"
+ "sub x13, x13, x21\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x8, x7\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1sb { z12.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #4\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #4\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"9:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1sb { z12.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #2\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1sb { z12.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"sub x13, x13, #0x1\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "lsr x19, x15, #0x1\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
"add z12.h, z12.h, z5.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
- "cmp x19, x13\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "csel x22, x19, x13, LT\n"
+ "cmp x20, x13\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
"add z13.h, z13.h, z5.h\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
- "add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- "addvl x20, SP, #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1sb { z12.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "add x19, x14, %x[ld_in_row]\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
"trn1 z12.h, z12.h, z20.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add z12.h, z12.h, z5.h\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1sb { z14.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "add x11, x11, #0x1\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"ld1sb { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1sb { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1sb { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "ld1sb { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x9]\n"
- "ld1sb { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
- "ld1sb { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "st1b { z29.s }, p1, [x10]\n"
+ "ld1sb { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
+ "add x27, x27, x25\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1sb { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"mov z16.d, z16.d\n"
"add z13.h, z13.h, z5.h\n"
"add x14, x14, %x[ld_in_col]\n"
@@ -411,108 +411,108 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
- "cbz x21, 15f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
- "addvl x19, SP, #4\n"
+ "addvl x20, SP, #4\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
- "addvl x19, SP, #2\n"
+ "addvl x20, SP, #2\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
@@ -521,357 +521,357 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- "lsr x19, x15, #0x1\n"
- "cmp x19, x13\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
"mov z16.d, z16.d\n"
- "csel x21, x19, x13, LT\n"
+ "csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x21\n"
- "cbz x21, 17f\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z19.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
"mov x12, #0x8\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "addvl x19, SP, #2\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
- "add x11, x11, #0x1\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"mov z16.d, z16.d\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "subs x21, x21, #0x1\n"
+ "subs x22, x22, #0x1\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
"trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z29.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
+ "st1b { z31.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x19, x14, %x[ld_in_row]\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #2\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"trn1 z12.h, z12.h, z20.h\n"
- "add x11, x11, #0x1\n"
+ "add x8, x8, #0x1\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
"add x14, x14, %x[ld_in_col]\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "addvl x19, SP, #4\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "add x11, x11, #0x1\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "add x8, x8, #0x1\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add x11, x11, #0x1\n"
+ "add x8, x8, #0x1\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 20b\n"
"21:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x19\n"
+ "add x14, x14, x20\n"
"str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #6\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 722fd5eaad..d366b3c8d5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,20 +69,20 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x5\n"
+ "sub x20, x20, x4\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z25.h, p2/M, z25.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
@@ -90,298 +90,262 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z6.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z6.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x23\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "mov x22, x23\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z2.h, #0x0\n"
"sub z18.h, z18.h, z12.h\n"
"incw x23\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z17.h, z17.h, z12.h\n"
"trn1 z0.h, z2.h, z18.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z21.h, z21.h, z12.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z16.h, z16.h, z12.h\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
"sub z15.h, z15.h, z12.h\n"
- "mov x21, x23\n"
+ "mov x22, x23\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"trn1 z10.h, z16.h, z15.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z18.h, z18.h, z12.h\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z21.h, z21.h, z12.h\n"
- "addvl x20, SP, #30\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "addvl x21, SP, #30\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"incw x23\n"
"sub z16.h, z16.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
- "addvl x20, x20, #-6\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
+ "addvl x21, x21, #-6\n"
"sub z15.h, z15.h, z12.h\n"
- "mov x21, x23\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "mov x22, x23\n"
+ "st1h { z0.h }, p2, [x21]\n"
"trn1 z0.h, z2.h, z18.h\n"
"incw x23\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
"sub z18.h, z18.h, z12.h\n"
- "addvl x20, x20, #-6\n"
+ "addvl x21, x21, #-6\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
"sub z21.h, z21.h, z12.h\n"
- "mov x21, x23\n"
+ "mov x22, x23\n"
"sub z16.h, z16.h, z12.h\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x21]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z0.h, z2.h, z18.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
"sub z18.h, z18.h, z12.h\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
- "addvl x20, x20, #-6\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
+ "addvl x21, x21, #-6\n"
"sub z21.h, z21.h, z12.h\n"
"sub z16.h, z16.h, z12.h\n"
- "mov x21, x23\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "mov x22, x23\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z0.h, z2.h, z18.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
"sub z18.h, z18.h, z12.h\n"
- "addvl x20, x20, #-6\n"
+ "addvl x21, x21, #-6\n"
"sub z17.h, z17.h, z12.h\n"
"sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z16.h, z16.h, z12.h\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"mov z7.d, z6.d\n"
"trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z8.h, z18.h, z17.h\n"
"trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
"trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
- "addvl x20, x20, #-6\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
+ "addvl x21, x21, #-6\n"
"trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x20]\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z3.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "st1h { z0.h }, p2, [x21]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z1.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x16, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x7, x22, LSL #22\n"
- "mov x21, #0x8\n"
- "add x20, x6, x5\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "ldr x14, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x17, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x4\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
"mov x8, #0x8\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x6, x15\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x15, x6, x19, x15\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
- "mov x21, #0x4\n"
- "ldp x13, x4, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x25], #0x10\n"
".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x10, x9, [x19], #0x10\n"
+ "ldp x3, x10, [x20], #0x10\n"
".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x28, x27, [x24], #0x10\n"
+ "ldp x9, x28, [x25], #0x10\n"
".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "sub x14, x14, x20\n"
+ "sub x15, x15, x21\n"
".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x5\n"
+ "adds XZR, x6, x4\n"
"bne 14f\n"
- "cbz x21, 12f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 11f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 10f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x20, x15, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x15]\n"
- "addvl x19, SP, #24\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z29.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- "9:" // Unpadded: 3 priming loads
- "add x21, x15, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x15]\n"
- "addvl x20, SP, #18\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
+ "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z16.h, z29.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
@@ -390,47 +354,37 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"ld1sb { z16.s }, p1/Z, [x21]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
+ "add z30.h, z30.h, z25.h\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- "10:" // Unpadded: 2 priming loads
- "add x22, x15, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x15]\n"
- "addvl x21, SP, #12\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
@@ -441,54 +395,44 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"add z30.h, z30.h, z25.h\n"
".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- "11:" // Unpadded: 1 priming loads
- "add x23, x15, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x15]\n"
- "addvl x22, SP, #6\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
"ld1sb { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"ld1sb { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1sb { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1sb { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x23]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
@@ -504,121 +448,177 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z27.h, z17.h, z16.h\n"
+ "add z27.h, z27.h, z25.h\n"
+ "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z17.h, z16.h\n"
+ "add z28.h, z28.h, z25.h\n"
+ "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z17.h, z16.h\n"
+ "add z29.h, z29.h, z25.h\n"
+ "ld1sb { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "add z30.h, z30.h, z25.h\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
"12:" // Unpadded: 0 priming loads
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 22f\n"
- "add x19, x15, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x15]\n"
- "sub x16, x16, #0x1\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "cbz x17, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x16]\n"
+ "sub x17, x17, #0x1\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
- "sub x14, x14, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "cmp x16, x14\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x17, x15\n"
"add z27.h, z27.h, z25.h\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
- "csel x24, x16, x14, LT\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "csel x25, x17, x15, LT\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z28.h, z28.h, z25.h\n"
- "add x15, x15, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub x14, x14, x24\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"trn1 z30.h, z17.h, z16.h\n"
"add z30.h, z30.h, z25.h\n"
- "cbz x24, 21f\n"
+ "cbz x25, 21f\n"
"13:" // Unpadded: Main loop
- "addvl x23, SP, #6\n"
+ "addvl x24, SP, #6\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x22, SP, #12\n"
- "ld1sb { z23.s }, p1/Z, [x15]\n"
+ "addvl x23, SP, #12\n"
+ "ld1sb { z23.s }, p1/Z, [x16]\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "addvl x21, SP, #18\n"
- "addvl x20, SP, #24\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add x19, x15, %x[ld_in_row]\n"
- "ld1sb { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "subs x24, x24, #0x1\n"
- "add x15, x15, %x[ld_in_col]\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1sb { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
"trn1 z27.h, z23.h, z22.h\n"
@@ -645,407 +645,407 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
"add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z14.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
- "cbz x21, 19f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 18f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 17f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x15]\n"
+ "ld1sb { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x20, x15, %x[ld_in_row]\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x15]\n"
+ "ld1sb { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #18\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #24\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
"trn1 z30.h, z17.h, z16.h\n"
- "add x15, x15, %x[ld_in_col]\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x15]\n"
+ "ld1sb { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
- "addvl x21, SP, #12\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #18\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
"trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x15]\n"
+ "ld1sb { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
- "addvl x22, SP, #6\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x21, SP, #12\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
"trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "addvl x19, SP, #24\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
"19:" // Padded: 0 priming loads
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 22f\n"
+ "cbz x17, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x15]\n"
+ "ld1sb { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "sub x16, x16, #0x1\n"
- "sub x14, x14, #0x1\n"
- "cmp x16, x14\n"
+ "sub x17, x17, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x17, x15\n"
"trn1 z29.h, z19.h, z18.h\n"
"trn1 z30.h, z17.h, z16.h\n"
- "csel x24, x16, x14, LT\n"
- "add x15, x15, %x[ld_in_col]\n"
- "sub x14, x14, x24\n"
- "cbz x24, 21f\n"
+ "csel x25, x17, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x15]\n"
+ "ld1sb { z23.s }, p0/Z, [x16]\n"
"add z23.h, p0/M, z23.h, z25.h\n"
- "add x23, x15, %x[ld_in_row]\n"
+ "add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x23]\n"
+ "ld1sb { z22.s }, p0/Z, [x24]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #12\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
"add z22.h, p0/M, z22.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
- "ld1sb { z21.s }, p0/Z, [x23]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z21.s }, p0/Z, [x24]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
"add z21.h, p0/M, z21.h, z25.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z20.s }, p0/Z, [x23]\n"
+ "ld1sb { z20.s }, p0/Z, [x24]\n"
"add z20.h, p0/M, z20.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "subs x24, x24, #0x1\n"
+ "subs x25, x25, #0x1\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z19.s }, p0/Z, [x23]\n"
+ "ld1sb { z19.s }, p0/Z, [x24]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1sb { z18.s }, p0/Z, [x23]\n"
+ "ld1sb { z18.s }, p0/Z, [x24]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1sb { z17.s }, p0/Z, [x23]\n"
+ "ld1sb { z17.s }, p0/Z, [x24]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "ld1sb { z16.s }, p0/Z, [x24]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
@@ -1069,56 +1069,56 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 20b\n"
"21:" // Main loop tail
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x21, SP, #12\n"
+ "addvl x22, SP, #12\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
@@ -1135,20 +1135,20 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"22:" // Main loop skip tail
- "cbz x14, 24f\n"
+ "cbz x15, 24f\n"
"23:" // Right padding loop
".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
- "subs x14, x14, #0x1\n"
+ "subs x15, x15, #0x1\n"
".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
"add x11, x11, #0x2\n"
@@ -1157,44 +1157,44 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 23b\n"
"24:" // End
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
"incw x23, ALL, MUL #16\n"
"incw x23, ALL, MUL #9\n"
"str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x15, x15, x19\n"
- "str x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x5\n"
+ "whilelt p1.s, x5, x7\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x16, x16, x20\n"
+ "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 81829b5f4e..3e8510392f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,20 +69,20 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0xb\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x4\n"
+ "sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x6\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x5\n"
+ "whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
- "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z9.h, p2/M, z9.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
@@ -90,310 +90,227 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z28.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z28.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"sub z12.h, z12.h, z18.h\n"
- "incw x21\n"
+ "incw x22\n"
"mov z14.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
- "addvl x20, SP, #15\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z17.h, z17.h, z18.h\n"
"trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "mov x19, x21\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
"sub z16.h, z16.h, z18.h\n"
- "incw x21\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "incw x22\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z12.h, z12.h, z18.h\n"
- "addvl x20, x20, #-3\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x20]\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x21]\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z17.h, z17.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "mov x19, x21\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"sub z16.h, z16.h, z18.h\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z10.h, z24.h, z17.h\n"
"sub z12.h, z12.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z0.h, z16.h, z14.h\n"
- "incw x21\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "incw x22\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
"sub z17.h, z17.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "addvl x20, x20, #-3\n"
- "mov x19, x21\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z2.h }, p2, [x21]\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z16.h, z16.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z12.h, z12.h, z18.h\n"
"sub z25.h, z25.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "incw x21\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
"sub z24.h, z24.h, z18.h\n"
"sub z17.h, z17.h, z18.h\n"
- "addvl x20, x20, #-3\n"
- "mov x19, x21\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z2.h }, p2, [x21]\n"
"sub z16.h, z16.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z0.h, z16.h, z14.h\n"
"sub z12.h, z12.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"sub z24.h, z24.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z17.h, z17.h, z18.h\n"
"sub z16.h, z16.h, z18.h\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "addvl x20, x20, #-3\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z2.h }, p2, [x21]\n"
"mov z29.d, z28.d\n"
"mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"mov z31.d, z28.d\n"
"trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "addvl x20, x20, #-3\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
"trn1 z10.h, z24.h, z17.h\n"
"trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x20]\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z3.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "st1h { z2.h }, p2, [x21]\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z1.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x17, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x6, x22, LSL #22\n"
- "mov x21, #0xb\n"
- "add x20, x5, x4\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x8, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x5, x16\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x16, x5, x19, x16\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
- "mov x21, #0x4\n"
- "ldp x14, x13, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x25], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x11, x10, [x19], #0x10\n"
+ "ldp x13, x11, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x9, x28, [x24], #0x10\n"
- "ldp x27, x26, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
+ "ldp x10, x9, [x25], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "and x21, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "sub x15, x15, x20\n"
+ "sub x16, x16, x21\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x5, x4\n"
+ "adds XZR, x4, x3\n"
"bne 14f\n"
- "cbz x21, 12f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 11f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 10f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
- "addvl x19, SP, #12\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "9:" // Unpadded: 3 priming loads
- "add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
- "addvl x19, SP, #9\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1sb { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "10:" // Unpadded: 2 priming loads
- "add x21, x16, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
- "addvl x20, SP, #6\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
"ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"add z11.h, z11.h, z9.h\n"
"ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #12\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1sb { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
"ld1sb { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
@@ -407,40 +324,34 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1sb { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z15.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"add z16.h, z16.h, z9.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "11:" // Unpadded: 1 priming loads
- "add x21, x16, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
- "addvl x20, SP, #3\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
"ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"add z11.h, z11.h, z9.h\n"
"ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #9\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1sb { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
"ld1sb { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
@@ -454,127 +365,100 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1sb { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1sb { z17.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z15.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"add z16.h, z16.h, z9.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "12:" // Unpadded: 0 priming loads
- "cmp x17, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "blt 22f\n"
- "add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x2\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z21.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- "sub x15, x15, #0x1\n"
- "ld1sb { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "lsr x19, x17, #0x1\n"
"add z11.h, z11.h, z9.h\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
- "cmp x19, x15\n"
- "ld1sb { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "csel x25, x19, x15, LT\n"
"add z12.h, z12.h, z9.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "and x17, x17, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
"trn1 z15.h, z15.h, z17.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add z15.h, z15.h, z9.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- "sub x15, x15, x25\n"
- "cbz x25, 21f\n"
- "13:" // Unpadded: Main loop
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "mov z16.d, z16.d\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- "add x22, x16, %x[ld_in_row]\n"
- "addvl x21, SP, #3\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "addvl x20, SP, #9\n"
- "subs x25, x25, #0x1\n"
+ "add z16.h, z16.h, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "add x19, x16, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
"ld1sb { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
"add z11.h, z11.h, z9.h\n"
"ld1sb { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "addvl x20, SP, #9\n"
"ld1sb { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1sb { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1sb { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z9.h\n"
"ld1sb { z14.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"ld1sb { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
"ld1sb { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"ld1sb { z17.s }, p1/Z, [x22]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add x22, x22, %x[ld_in_row]\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"mov z16.d, z16.d\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
@@ -583,50 +467,166 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"add z16.h, z16.h, z9.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- "ld1sb { z11.s }, p1/Z, [x16]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "ld1sb { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "ld1sb { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "ld1sb { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "sub x16, x16, #0x1\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z11.h, z11.h, z9.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
- "add x28, x28, x26\n"
- "ld1sb { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "cmp x20, x16\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z12.h, z12.h, z9.h\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z19.h\n"
+ "add z13.h, z13.h, z9.h\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z18.h\n"
+ "add z14.h, z14.h, z9.h\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z15.h, z15.h, z9.h\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "mov z16.d, z16.d\n"
+ "add z16.h, z16.h, z9.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
+ "ld1sb { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z21.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z11.h, z11.h, z9.h\n"
+ "ld1sb { z12.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "ld1sb { z20.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add z12.h, z12.h, z9.h\n"
+ "ld1sb { z13.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1sb { z19.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z19.h\n"
+ "add z13.h, z13.h, z9.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ "ld1sb { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z18.h\n"
+ "add z14.h, z14.h, z9.h\n"
+ "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z15.h, z15.h, z9.h\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ "add z16.h, z16.h, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1sb { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z21.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add x9, x9, x27\n"
+ "ld1sb { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"add z11.h, z11.h, z9.h\n"
- "ld1sb { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z12.h, z12.h, z9.h\n"
- "ld1sb { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z13.h, z13.h, z9.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
- "ld1sb { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
- "ld1sb { z16.s }, p1/Z, [x19]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z9.h\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
@@ -634,717 +634,717 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
- "cbz x21, 19f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 18f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 17f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z19.s }, p0/Z, [x21]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x19, SP, #12\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x20]\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "ld1sb { z19.s }, p0/Z, [x21]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x19, SP, #9\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x19]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x19]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x20, SP, #6\n"
+ "addvl x21, SP, #6\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #12\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x19]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x19]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x20, SP, #3\n"
+ "addvl x21, SP, #3\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #9\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"19:" // Padded: 0 priming loads
- "cmp x17, #0x2\n"
+ "cmp x7, #0x2\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x19]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x19]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
- "sub x17, x17, #0x2\n"
- "sub x15, x15, #0x1\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "lsr x19, x17, #0x1\n"
- "cmp x19, x15\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "csel x24, x19, x15, LT\n"
- "add x16, x16, %x[ld_in_col]\n"
- "and x17, x17, #0x1\n"
- "sub x15, x15, x24\n"
- "cbz x24, 21f\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
"20:" // Padded: Main loop
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add x19, x16, %x[ld_in_row]\n"
- "addvl x21, SP, #3\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #9\n"
- "subs x24, x24, #0x1\n"
+ ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x19]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x19]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"trn1 z11.h, z11.h, z21.h\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x19]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x19]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"mov x12, #0x4\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"add z17.h, p0/M, z17.h, z9.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
"bgt 20b\n"
"21:" // Main loop tail
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add x21, x16, %x[ld_in_row]\n"
- "addvl x20, SP, #3\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- "addvl x19, SP, #9\n"
+ ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "ld1sb { z21.s }, p0/Z, [x22]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1sb { z12.s }, p0/Z, [x21]\n"
+ "ld1sb { z12.s }, p0/Z, [x22]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "ld1sb { z20.s }, p0/Z, [x22]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"trn1 z11.h, z11.h, z21.h\n"
- "ld1sb { z13.s }, p0/Z, [x21]\n"
+ "ld1sb { z13.s }, p0/Z, [x22]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x21]\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "ld1sb { z14.s }, p0/Z, [x22]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "ld1sb { z15.s }, p0/Z, [x22]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
"22:" // Main loop skip tail
- "cbz x17, 23f\n" // Skip remainder inputs
+ "cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "ld1sb { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x19]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z12.s }, p0/Z, [x19]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x19]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z13.s }, p0/Z, [x19]\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x19]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z14.s }, p0/Z, [x19]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x19]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x19]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x19]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "ld1sb { z16.s }, p0/Z, [x19]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- "addvl x20, SP, #6\n"
+ "addvl x21, SP, #6\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "addvl x19, SP, #12\n"
- "sub x15, x15, #0x1\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"23:" // Tail input: End
- "cbz x15, 25f\n"
+ "cbz x16, 25f\n"
"24:" // Right padding loop
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add x8, x8, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "subs x15, x15, #0x1\n"
+ "subs x16, x16, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
"bgt 24b\n"
"25:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #16\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x7\n"
- "whilelt p1.s, x7, x6\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x19\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #16\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x17, x17, x20\n"
+ "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index d59879b206..a7ef556840 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,18 +69,18 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x6\n"
- "ldr x8, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x7\n"
+ "sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x8\n"
+ "whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z24.h, p2/M, z24.h\n"
@@ -90,377 +90,377 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z8.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z8.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1b { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z20.h, #0x0\n"
"sub z27.h, z27.h, z21.h\n"
- "incw x21\n"
- "ld1b { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "incw x22\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z23.h, z23.h, z21.h\n"
"trn1 z0.h, z20.h, z27.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z21.h\n"
- "mov x19, x21\n"
+ "mov x20, x22\n"
"trn1 z1.h, z27.h, z23.h\n"
- "ld1b { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"trn1 z2.h, z23.h, z16.h\n"
"trn1 z3.h, z16.h, z20.h\n"
- "ld1b { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z27.h, z27.h, z21.h\n"
"sub z23.h, z23.h, z21.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z21.h\n"
- "addvl x20, SP, #12\n"
- "incw x21\n"
- "addvl x20, x20, #-4\n"
- "mov x19, x21\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z0.h }, p2, [x21]\n"
"trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z1.h, z27.h, z23.h\n"
- "ld1b { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z2.h, z23.h, z16.h\n"
- "ld1b { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z3.h, z16.h, z20.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"sub z27.h, z27.h, z21.h\n"
"sub z23.h, z23.h, z21.h\n"
- "addvl x20, x20, #-4\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z16.h, z16.h, z21.h\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
"mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z0.h, z20.h, z27.h\n"
"trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
- "addvl x20, x20, #-4\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
"trn1 z2.h, z23.h, z16.h\n"
"trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x20]\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z10.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "st1h { z0.h }, p2, [x21]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z11.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x17, x22, LSL #22\n"
- "mov x21, #0x6\n"
- "add x20, x8, x7\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "mov x11, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x8, x14\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x14, x8, x19, x14\n"
- ".inst 0xc0046900 // mova za.d[x11, #0], { z8.d-z9.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046901 // mova za.d[x11, #1], { z8.d-z9.d }\n"
- "mov x21, #0x2\n"
- "ldp x10, x9, [x24], #0x10\n"
- ".inst 0xc0046902 // mova za.d[x11, #2], { z8.d-z9.d }\n"
- "ldp x28, x27, [x19], #0x10\n"
- ".inst 0xc0046903 // mova za.d[x11, #3], { z8.d-z9.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- "ldp x26, x25, [x24], #0x10\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
- "ldp x24, x23, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "sub x13, x13, x20\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x25], #0x10\n"
+ ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ "ldp x27, x26, [x25], #0x10\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x8, x7\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #8\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- "9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
- "addvl x20, SP, #4\n"
+ "addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z17.h, z16.h\n"
"add z13.h, z13.h, z24.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #8\n"
+ "add x14, x14, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z17.h, z16.h\n"
"add z14.h, z14.h, z24.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x14, x14, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"trn1 z15.h, z17.h, z16.h\n"
"add z15.h, z15.h, z24.h\n"
".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z17.h, z16.h\n"
+ "add z13.h, z13.h, z24.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z17.h, z16.h\n"
+ "add z14.h, z14.h, z24.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "trn1 z15.h, z17.h, z16.h\n"
+ "add z15.h, z15.h, z24.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z17.h, z16.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
"add z13.h, z13.h, z24.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z17.h, z16.h\n"
- "csel x22, x15, x13, LT\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z14.h, z14.h, z24.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"trn1 z15.h, z17.h, z16.h\n"
"add z15.h, z15.h, z24.h\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- "addvl x21, SP, #4\n"
- "addvl x20, SP, #8\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
"ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "add x19, x14, %x[ld_in_row]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
"trn1 z13.h, z21.h, z20.h\n"
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
"add z13.h, z13.h, z24.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
"trn1 z14.h, z19.h, z18.h\n"
"trn1 z15.h, z17.h, z16.h\n"
- "add x11, x11, #0x2\n"
+ "add x8, x8, #0x2\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
"add z15.h, z15.h, z24.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
- "cbz x21, 15f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #8\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
"add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #4\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
"add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #8\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
"trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
"15:" // Padded: 0 priming loads
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
@@ -469,192 +469,192 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
"trn1 z15.h, z17.h, z16.h\n"
- "csel x22, x15, x13, LT\n"
+ "csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
"add z21.h, p0/M, z21.h, z24.h\n"
- "add x21, x14, %x[ld_in_row]\n"
+ "add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
"add z20.h, p0/M, z20.h, z24.h\n"
- "add x21, x21, %x[ld_in_row]\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
- "addvl x20, SP, #4\n"
+ "addvl x21, SP, #4\n"
"add z18.h, p0/M, z18.h, z24.h\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #8\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x10]\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
"trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
- "add x11, x11, #0x2\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"trn1 z14.h, z19.h, z18.h\n"
"trn1 z15.h, z17.h, z16.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- "addvl x20, SP, #4\n"
- "addvl x19, SP, #8\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x26]\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
- "add x11, x11, #0x2\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- "add x11, x11, #0x2\n"
+ "add x8, x8, #0x2\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 19b\n"
"20:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x19\n"
+ "add x14, x14, x20\n"
"str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #12\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_2rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_2rows_dot_za/generic.cpp
deleted file mode 100644
index 9a0840cfc4..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_2rows_dot_za/generic.cpp
+++ /dev/null
@@ -1,592 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
-#include <algorithm>
-#include <cstddef>
-#include "arm_gemm.hpp"
-
-using arm_gemm::Requantize32;
-
-namespace arm_conv {
-namespace depthwise {
-
-void sme2_u8q_planar_3x3_s2_2rows_dot_za_impl(
- const uint8_t *inptr,
- size_t ld_in_row,
- size_t ld_in_col,
- unsigned int pad_top,
- unsigned int valid_input_rows,
- unsigned int pad_left,
- unsigned int valid_input_cols,
- const uint8_t *weights,
- uint8_t **outptrs,
- const size_t *outlds,
- unsigned int output_cols,
- unsigned int start_channel,
- unsigned int valid_channels,
- const arm_gemm::Requantize32 &qp
-)
-{
- struct Args
- {
- const uint8_t *inptr;
- long unsigned int pad_top, pad_bottom, pad_left;
- const uint8_t *weights;
- long unsigned int input_cols, output_cols;
- uint8_t **outptrs;
- const size_t *ld_out_cols;
- long unsigned int n, n_channels;
- };
-
- Args args = { inptr, pad_top, 5u - std::min(5u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, start_channel, valid_channels };
-
- __asm__ __volatile__(
- "ldr x11, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x5\n"
- ".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x11\n"
- "ldr x10, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ptrue p0.b\n"
- "mov z12.s, #0x0\n"
- "ldr x22, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p5.s, XZR, x22\n"
- "whilelt p9.s, XZR, x19\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "whilelt p8.s, XZR, x10\n"
- "eor p8.b, p0/Z, p8.b, p9.b\n"
- "ldr x21, [%x[args], %[offsetof_Args_n]]\n"
- "cbz x19, 1f\n"
- "ld1w { z12.s }, p5/Z, [x19, x21, LSL #2]\n"
- "1:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "mov z0.h, #0x0\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "mov z13.d, z12.d\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "ld1rh { z28.h }, p0/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z27.h, z27.h, z28.h\n"
- "sub z16.h, z16.h, z28.h\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z21.h, z21.h, z28.h\n"
- "trn1 z8.h, z27.h, z21.h\n"
- "sub z20.h, z20.h, z28.h\n"
- "sub z18.h, z18.h, z28.h\n"
- "trn1 z7.h, z16.h, z20.h\n"
- "sub z17.h, z17.h, z28.h\n"
- "sub z24.h, z24.h, z28.h\n"
- "trn1 z6.h, z17.h, z0.h\n"
- "sub z19.h, z19.h, z28.h\n"
- "trn1 z5.h, z24.h, z0.h\n"
- "trn1 z4.h, z22.h, z18.h\n"
- "trn1 z3.h, z19.h, z0.h\n"
- "ld1rh { z21.h }, p0/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "ld1rw { z2.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z1.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "cbz x19, 2f\n"
- "ld1w { z1.s }, p5/Z, [x19, x21, LSL #2]\n"
- "2:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "ld1rw { z0.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "cbz x19, 3f\n"
- "ld1w { z0.s }, p5/Z, [x19, x21, LSL #2]\n"
- "3:" // Load right_shift: End
- "ldr x28, [%x[args], %[offsetof_Args_input_cols]]\n"
- "orr x21, x28, %x[ld_in_col], LSL #16\n"
- "orr x21, x22, x21, LSL #22\n"
- "ld1rw { z20.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ldr x27, [%x[args], %[offsetof_Args_inptr]]\n"
- "mov x20, #0x5\n"
- "add x19, x10, x11\n"
- "ld1rw { z19.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov x9, #0x0\n"
- "ldr x26, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x21, x21, #0x0\n"
- "sub x20, x20, x19\n"
- "mov x19, x27\n"
- "4:" // Issue prefetches
- "subs x20, x20, #0x1\n"
- ".inst 0xf8b54a7c // rprfm pldstrm, x21, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
- "bgt 4b\n"
- "ldr x21, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x27, x10, x19, x27\n"
- ".inst 0xc0042980 // mova za.d[x9, #0], { z12.d-z13.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0042981 // mova za.d[x9, #1], { z12.d-z13.d }\n"
- "mov x25, #0x2\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldp x22, x21, [x19], #0x10\n"
- "cbz x20, 6f\n"
- "cmp x20, x25\n"
- "csel x19, x20, x25, LT\n"
- "sub x20, x20, x19\n"
- "sub x25, x25, x19\n"
- "cbz x20, 6f\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "and x25, x20, #0x1\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- "sub x26, x26, x20\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "5:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "bgt 5b\n"
- "6:" // Left padding: End
- "adds XZR, x10, x11\n"
- "bne 11f\n"
- "cbz x25, 9f\n"
- "cmp x25, #0x1\n"
- "sub x28, x28, x25\n"
- "beq 8f\n"
- "7:" // Unpadded: 2 priming loads
- "add x19, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16835c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z8.h\n"
- ".inst 0xc16635e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z6.h\n"
- "8:" // Unpadded: 1 priming loads
- "add x19, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "9:" // Unpadded: 0 priming loads
- "add x20, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "sub x28, x28, #0x2\n"
- "ld1b { z18.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "sub x26, x26, #0x1\n"
- "ld1b { z15.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "lsr x19, x28, #0x1\n"
- "ld1b { z17.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "cmp x19, x26\n"
- "ld1b { z16.s }, p5/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
- "csel x20, x19, x26, LT\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "add x27, x27, %x[ld_in_col]\n"
- "and x28, x28, #0x1\n"
- "sub x26, x26, x20\n"
- "cbz x20, 16f\n"
- "10:" // Unpadded: Main loop
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- "subs x20, x20, #0x1\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- "sub z18.h, z18.h, z21.h\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "add x9, x9, #0x1\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z14.h, z14.h, z21.h\n"
- "sub z18.h, z18.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "sub z17.h, z17.h, z21.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "add x27, x27, %x[ld_in_col]\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "bgt 10b\n"
- "b 16f\n"
- "11:" // Padded
- "cbz x25, 14f\n"
- "cmp x25, #0x1\n"
- "sub x28, x28, x25\n"
- "beq 13f\n"
- "12:" // Padded: 2 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16835c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z8.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc16635e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z6.h\n"
- "13:" // Padded: 1 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "14:" // Padded: 0 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "sub x28, x28, #0x2\n"
- "sub x26, x26, #0x1\n"
- "lsr x19, x28, #0x1\n"
- "mov z16.d, z16.d\n"
- "cmp x19, x26\n"
- "csel x20, x19, x26, LT\n"
- "add x27, x27, %x[ld_in_col]\n"
- "and x28, x28, #0x1\n"
- "sub x26, x26, x20\n"
- "cbz x20, 16f\n"
- "15:" // Padded: Main loop
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "add x9, x9, #0x1\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "add x19, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "subs x20, x20, #0x1\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "add x27, x27, %x[ld_in_col]\n"
- "bgt 15b\n"
- "16:" // Main loop tail
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "mov x12, #0x4\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "add x9, x9, #0x1\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "cbz x28, 17f\n" // Skip remainder inputs
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "sub x26, x26, #0x1\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "add x9, x9, #0x1\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "17:" // Tail input: End
- "cbz x26, 19f\n"
- "18:" // Right padding loop
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "add x9, x9, #0x1\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "subs x26, x26, #0x1\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "bgt 18b\n"
- "19:" // End
- ".inst 0xd503467f // SMSTOP\n"
- :
- : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_n] "I" (offsetof(Args, n)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace depthwise
-} // namespace arm_conv
-
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index bdf1ba6f9c..630d870433 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,18 +69,18 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x9\n"
- "ldr x8, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x7\n"
+ "sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x8\n"
+ "whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z5.h, p2/M, z5.h\n"
@@ -90,317 +90,317 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z0.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z0.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"sub z24.h, z24.h, z13.h\n"
- "incw x21\n"
+ "incw x22\n"
"mov z17.h, #0x0\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z25.h, z25.h, z13.h\n"
"trn1 z10.h, z24.h, z25.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z13.h\n"
- "mov x19, x21\n"
+ "mov x20, x22\n"
"trn1 z11.h, z16.h, z17.h\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z24.h, z24.h, z13.h\n"
- "addvl x20, SP, #6\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z25.h, z25.h, z13.h\n"
- "incw x21\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
+ "incw x22\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z13.h\n"
- "addvl x20, x20, #-2\n"
- "mov x19, x21\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z10.h }, p2, [x21]\n"
"trn1 z10.h, z24.h, z25.h\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z11.h, z16.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"sub z24.h, z24.h, z13.h\n"
"sub z25.h, z25.h, z13.h\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"sub z16.h, z16.h, z13.h\n"
- "addvl x20, x20, #-2\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z10.h }, p2, [x21]\n"
"mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
- "addvl x20, x20, #-2\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "st1h { z10.h }, p2, [x21]\n"
"trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z8.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z7.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x17, x22, LSL #22\n"
- "mov x21, #0x9\n"
- "add x20, x8, x7\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "mov x11, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x8, x14\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x14, x8, x19, x14\n"
- ".inst 0xc0046c00 // mova za.d[x11, #0], { z0.d-z3.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046c01 // mova za.d[x11, #1], { z0.d-z3.d }\n"
- "mov x21, #0x2\n"
- "ldp x10, x9, [x24], #0x10\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "ldp x28, x27, [x19], #0x10\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x24], #0x10\n"
- "ldp x24, x23, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x25], #0x10\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x25], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "and x21, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "sub x13, x13, x20\n"
+ "sub x13, x13, x21\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x8, x7\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #4\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"9:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #2\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "lsr x19, x15, #0x1\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
- "cmp x19, x13\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "csel x22, x19, x13, LT\n"
+ "cmp x20, x13\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
- "add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- "addvl x20, SP, #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "add x19, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
"trn1 z12.h, z12.h, z20.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add z12.h, z12.h, z5.h\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "add x11, x11, #0x1\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1b { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x9]\n"
- "ld1b { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "st1b { z29.s }, p1, [x10]\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"mov z16.d, z16.d\n"
"add z13.h, z13.h, z5.h\n"
"add x14, x14, %x[ld_in_col]\n"
@@ -411,108 +411,108 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
- "cbz x21, 15f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
- "addvl x19, SP, #4\n"
+ "addvl x20, SP, #4\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
- "addvl x19, SP, #2\n"
+ "addvl x20, SP, #2\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
@@ -521,357 +521,357 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- "lsr x19, x15, #0x1\n"
- "cmp x19, x13\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
"mov z16.d, z16.d\n"
- "csel x21, x19, x13, LT\n"
+ "csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x21\n"
- "cbz x21, 17f\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z19.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
"mov x12, #0x8\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "addvl x19, SP, #2\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
- "add x11, x11, #0x1\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"mov z16.d, z16.d\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "subs x21, x21, #0x1\n"
+ "subs x22, x22, #0x1\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
"trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z29.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
+ "st1b { z31.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x19, x14, %x[ld_in_row]\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #2\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"trn1 z12.h, z12.h, z20.h\n"
- "add x11, x11, #0x1\n"
+ "add x8, x8, #0x1\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
"add x14, x14, %x[ld_in_col]\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "addvl x19, SP, #4\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "add x11, x11, #0x1\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "add x8, x8, #0x1\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add x11, x11, #0x1\n"
+ "add x8, x8, #0x1\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 20b\n"
"21:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x19\n"
+ "add x14, x14, x20\n"
"str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #6\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 4678e82f4e..2c19e232f8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,20 +69,20 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x5\n"
+ "sub x20, x20, x4\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z25.h, p2/M, z25.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
@@ -90,298 +90,262 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z6.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z6.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x23\n"
- "ld1b { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "mov x22, x23\n"
+ "ld1b { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z2.h, #0x0\n"
"sub z18.h, z18.h, z12.h\n"
"incw x23\n"
- "ld1b { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1b { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z17.h, z17.h, z12.h\n"
"trn1 z0.h, z2.h, z18.h\n"
- "ld1b { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1b { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z21.h, z21.h, z12.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1b { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z16.h, z16.h, z12.h\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z15.s }, p2/Z, [x21]\n"
+ "ld1b { z15.s }, p2/Z, [x22]\n"
"sub z15.h, z15.h, z12.h\n"
- "mov x21, x23\n"
+ "mov x22, x23\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1b { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"trn1 z10.h, z16.h, z15.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1b { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z18.h, z18.h, z12.h\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1b { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1b { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z21.h, z21.h, z12.h\n"
- "addvl x20, SP, #30\n"
- "ld1b { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "addvl x21, SP, #30\n"
+ "ld1b { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"incw x23\n"
"sub z16.h, z16.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x21]\n"
- "addvl x20, x20, #-6\n"
+ "ld1b { z15.s }, p2/Z, [x22]\n"
+ "addvl x21, x21, #-6\n"
"sub z15.h, z15.h, z12.h\n"
- "mov x21, x23\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "mov x22, x23\n"
+ "st1h { z0.h }, p2, [x21]\n"
"trn1 z0.h, z2.h, z18.h\n"
"incw x23\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1b { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1b { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
"sub z18.h, z18.h, z12.h\n"
- "addvl x20, x20, #-6\n"
+ "addvl x21, x21, #-6\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x21]\n"
+ "ld1b { z15.s }, p2/Z, [x22]\n"
"sub z21.h, z21.h, z12.h\n"
- "mov x21, x23\n"
+ "mov x22, x23\n"
"sub z16.h, z16.h, z12.h\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x21]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z0.h, z2.h, z18.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1b { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1b { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
"sub z18.h, z18.h, z12.h\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1b { z15.s }, p2/Z, [x21]\n"
- "addvl x20, x20, #-6\n"
+ "ld1b { z15.s }, p2/Z, [x22]\n"
+ "addvl x21, x21, #-6\n"
"sub z21.h, z21.h, z12.h\n"
"sub z16.h, z16.h, z12.h\n"
- "mov x21, x23\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "mov x22, x23\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z0.h, z2.h, z18.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1b { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1b { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1b { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1b { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1b { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1b { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z15.s }, p2/Z, [x21]\n"
+ "ld1b { z15.s }, p2/Z, [x22]\n"
"sub z18.h, z18.h, z12.h\n"
- "addvl x20, x20, #-6\n"
+ "addvl x21, x21, #-6\n"
"sub z17.h, z17.h, z12.h\n"
"sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z16.h, z16.h, z12.h\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"mov z7.d, z6.d\n"
"trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z8.h, z18.h, z17.h\n"
"trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
"trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
- "addvl x20, x20, #-6\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
+ "addvl x21, x21, #-6\n"
"trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x20]\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z3.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "st1h { z0.h }, p2, [x21]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z1.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x16, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x7, x22, LSL #22\n"
- "mov x21, #0x8\n"
- "add x20, x6, x5\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "ldr x14, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x17, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x4\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
"mov x8, #0x8\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x6, x15\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x15, x6, x19, x15\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
- "mov x21, #0x4\n"
- "ldp x13, x4, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x25], #0x10\n"
".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x10, x9, [x19], #0x10\n"
+ "ldp x3, x10, [x20], #0x10\n"
".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x28, x27, [x24], #0x10\n"
+ "ldp x9, x28, [x25], #0x10\n"
".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "sub x14, x14, x20\n"
+ "sub x15, x15, x21\n"
".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x5\n"
+ "adds XZR, x6, x4\n"
"bne 14f\n"
- "cbz x21, 12f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 11f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 10f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x20, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x19, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- "9:" // Unpadded: 3 priming loads
- "add x21, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x20, SP, #18\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z16.h, z29.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
@@ -390,47 +354,37 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"ld1b { z16.s }, p1/Z, [x21]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
+ "add z30.h, z30.h, z25.h\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- "10:" // Unpadded: 2 priming loads
- "add x22, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x21, SP, #12\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
@@ -441,54 +395,44 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"add z30.h, z30.h, z25.h\n"
".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- "11:" // Unpadded: 1 priming loads
- "add x23, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x22, SP, #6\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
@@ -504,121 +448,177 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z27.h, z17.h, z16.h\n"
+ "add z27.h, z27.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z17.h, z16.h\n"
+ "add z28.h, z28.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z17.h, z16.h\n"
+ "add z29.h, z29.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "add z30.h, z30.h, z25.h\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
"12:" // Unpadded: 0 priming loads
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 22f\n"
- "add x19, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "sub x16, x16, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "cbz x17, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "sub x17, x17, #0x1\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
- "sub x14, x14, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "cmp x16, x14\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x17, x15\n"
"add z27.h, z27.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
- "csel x24, x16, x14, LT\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "csel x25, x17, x15, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z28.h, z28.h, z25.h\n"
- "add x15, x15, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub x14, x14, x24\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"trn1 z30.h, z17.h, z16.h\n"
"add z30.h, z30.h, z25.h\n"
- "cbz x24, 21f\n"
+ "cbz x25, 21f\n"
"13:" // Unpadded: Main loop
- "addvl x23, SP, #6\n"
+ "addvl x24, SP, #6\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x22, SP, #12\n"
- "ld1b { z23.s }, p1/Z, [x15]\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z23.s }, p1/Z, [x16]\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "addvl x21, SP, #18\n"
- "addvl x20, SP, #24\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add x19, x15, %x[ld_in_row]\n"
- "ld1b { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "subs x24, x24, #0x1\n"
- "add x15, x15, %x[ld_in_col]\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1b { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
"trn1 z27.h, z23.h, z22.h\n"
@@ -645,407 +645,407 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
"add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z14.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
- "cbz x21, 19f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 18f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 17f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x20, x15, %x[ld_in_row]\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #18\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #24\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
"trn1 z30.h, z17.h, z16.h\n"
- "add x15, x15, %x[ld_in_col]\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
- "addvl x21, SP, #12\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #18\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
"trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
- "addvl x22, SP, #6\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x21, SP, #12\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
"trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "addvl x19, SP, #24\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
"19:" // Padded: 0 priming loads
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 22f\n"
+ "cbz x17, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "sub x16, x16, #0x1\n"
- "sub x14, x14, #0x1\n"
- "cmp x16, x14\n"
+ "sub x17, x17, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x17, x15\n"
"trn1 z29.h, z19.h, z18.h\n"
"trn1 z30.h, z17.h, z16.h\n"
- "csel x24, x16, x14, LT\n"
- "add x15, x15, %x[ld_in_col]\n"
- "sub x14, x14, x24\n"
- "cbz x24, 21f\n"
+ "csel x25, x17, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x15]\n"
+ "ld1b { z23.s }, p0/Z, [x16]\n"
"add z23.h, p0/M, z23.h, z25.h\n"
- "add x23, x15, %x[ld_in_row]\n"
+ "add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x23]\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #12\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
"add z22.h, p0/M, z22.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
- "ld1b { z21.s }, p0/Z, [x23]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
"add z21.h, p0/M, z21.h, z25.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p0/Z, [x23]\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
"add z20.h, p0/M, z20.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "subs x24, x24, #0x1\n"
+ "subs x25, x25, #0x1\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p0/Z, [x23]\n"
+ "ld1b { z19.s }, p0/Z, [x24]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p0/Z, [x23]\n"
+ "ld1b { z18.s }, p0/Z, [x24]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p0/Z, [x23]\n"
+ "ld1b { z17.s }, p0/Z, [x24]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p0/Z, [x23]\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
@@ -1069,56 +1069,56 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 20b\n"
"21:" // Main loop tail
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x21, SP, #12\n"
+ "addvl x22, SP, #12\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
@@ -1135,20 +1135,20 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"22:" // Main loop skip tail
- "cbz x14, 24f\n"
+ "cbz x15, 24f\n"
"23:" // Right padding loop
".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
- "subs x14, x14, #0x1\n"
+ "subs x15, x15, #0x1\n"
".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
"add x11, x11, #0x2\n"
@@ -1157,44 +1157,44 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 23b\n"
"24:" // End
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
"incw x23, ALL, MUL #16\n"
"incw x23, ALL, MUL #9\n"
"str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x15, x15, x19\n"
- "str x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x5\n"
+ "whilelt p1.s, x5, x7\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x16, x16, x20\n"
+ "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 84e8c8bea8..468e6778a4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,20 +69,20 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0xb\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x4\n"
+ "sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x6\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x5\n"
+ "whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
- "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z9.h, p2/M, z9.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
@@ -90,310 +90,227 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z28.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z28.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1b { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"sub z12.h, z12.h, z18.h\n"
- "incw x21\n"
+ "incw x22\n"
"mov z14.h, #0x0\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
- "addvl x20, SP, #15\n"
- "ld1b { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "addvl x21, SP, #15\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z17.h, z17.h, z18.h\n"
"trn1 z10.h, z24.h, z17.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
- "mov x19, x21\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
"sub z16.h, z16.h, z18.h\n"
- "incw x21\n"
- "ld1b { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "incw x22\n"
+ "ld1b { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z12.h, z12.h, z18.h\n"
- "addvl x20, x20, #-3\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "addvl x21, x21, #-3\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"trn1 z0.h, z16.h, z14.h\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x20]\n"
- "ld1b { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x21]\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z17.h, z17.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
- "mov x19, x21\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"sub z16.h, z16.h, z18.h\n"
- "ld1b { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z10.h, z24.h, z17.h\n"
"sub z12.h, z12.h, z18.h\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z0.h, z16.h, z14.h\n"
- "incw x21\n"
- "ld1b { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "incw x22\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
"sub z17.h, z17.h, z18.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
- "addvl x20, x20, #-3\n"
- "mov x19, x21\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z2.h }, p2, [x21]\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z16.h, z16.h, z18.h\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z10.h, z24.h, z17.h\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z12.h, z12.h, z18.h\n"
"sub z25.h, z25.h, z18.h\n"
- "ld1b { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z0.h, z16.h, z14.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
- "incw x21\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
"sub z24.h, z24.h, z18.h\n"
"sub z17.h, z17.h, z18.h\n"
- "addvl x20, x20, #-3\n"
- "mov x19, x21\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z2.h }, p2, [x21]\n"
"sub z16.h, z16.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1b { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "ld1b { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "ld1b { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z0.h, z16.h, z14.h\n"
"sub z12.h, z12.h, z18.h\n"
- "ld1b { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"sub z24.h, z24.h, z18.h\n"
- "ld1b { z16.s }, p2/Z, [x19]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
"sub z17.h, z17.h, z18.h\n"
"sub z16.h, z16.h, z18.h\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "addvl x20, x20, #-3\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z2.h }, p2, [x21]\n"
"mov z29.d, z28.d\n"
"mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"mov z31.d, z28.d\n"
"trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "addvl x20, x20, #-3\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
"trn1 z10.h, z24.h, z17.h\n"
"trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x20]\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z3.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "st1h { z2.h }, p2, [x21]\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z1.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x17, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x6, x22, LSL #22\n"
- "mov x21, #0xb\n"
- "add x20, x5, x4\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x8, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x5, x16\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x16, x5, x19, x16\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
- "mov x21, #0x4\n"
- "ldp x14, x13, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x25], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x11, x10, [x19], #0x10\n"
+ "ldp x13, x11, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x9, x28, [x24], #0x10\n"
- "ldp x27, x26, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
+ "ldp x10, x9, [x25], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "and x21, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "sub x15, x15, x20\n"
+ "sub x16, x16, x21\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x5, x4\n"
+ "adds XZR, x4, x3\n"
"bne 14f\n"
- "cbz x21, 12f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 11f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 10f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x20, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x19, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "9:" // Unpadded: 3 priming loads
- "add x20, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x19, SP, #9\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "10:" // Unpadded: 2 priming loads
- "add x21, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x20, SP, #6\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
"ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"add z11.h, z11.h, z9.h\n"
"ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #12\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1b { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
"ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
@@ -407,40 +324,34 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z15.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"add z16.h, z16.h, z9.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "11:" // Unpadded: 1 priming loads
- "add x21, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x20, SP, #3\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
"ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"add z11.h, z11.h, z9.h\n"
"ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #9\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1b { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
"ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
@@ -454,127 +365,100 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z15.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"add z16.h, z16.h, z9.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "12:" // Unpadded: 0 priming loads
- "cmp x17, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "blt 22f\n"
- "add x20, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x2\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z21.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- "sub x15, x15, #0x1\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "lsr x19, x17, #0x1\n"
"add z11.h, z11.h, z9.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
- "cmp x19, x15\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "csel x25, x19, x15, LT\n"
"add z12.h, z12.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "and x17, x17, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
"trn1 z15.h, z15.h, z17.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- "sub x15, x15, x25\n"
- "cbz x25, 21f\n"
- "13:" // Unpadded: Main loop
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "mov z16.d, z16.d\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- "add x22, x16, %x[ld_in_row]\n"
- "addvl x21, SP, #3\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "addvl x20, SP, #9\n"
- "subs x25, x25, #0x1\n"
+ "add z16.h, z16.h, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "add x19, x16, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
"ld1b { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
"add z11.h, z11.h, z9.h\n"
"ld1b { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "addvl x20, SP, #9\n"
"ld1b { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1b { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z9.h\n"
"ld1b { z14.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
"ld1b { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add x22, x22, %x[ld_in_row]\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"mov z16.d, z16.d\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
@@ -583,50 +467,166 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"add z16.h, z16.h, z9.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "ld1b { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "ld1b { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z11.h, z11.h, z9.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
- "add x28, x28, x26\n"
- "ld1b { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "cmp x20, x16\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z12.h, z12.h, z9.h\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z19.h\n"
+ "add z13.h, z13.h, z9.h\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z18.h\n"
+ "add z14.h, z14.h, z9.h\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z15.h, z15.h, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z16.d, z16.d\n"
+ "add z16.h, z16.h, z9.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z21.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z11.h, z11.h, z9.h\n"
+ "ld1b { z12.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add z12.h, z12.h, z9.h\n"
+ "ld1b { z13.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z19.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z19.h\n"
+ "add z13.h, z13.h, z9.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z18.h\n"
+ "add z14.h, z14.h, z9.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z15.h, z15.h, z9.h\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ "add z16.h, z16.h, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z21.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"add z11.h, z11.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z12.h, z12.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z13.h, z13.h, z9.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z9.h\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
@@ -634,717 +634,717 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
- "cbz x21, 19f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 18f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 17f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z19.s }, p0/Z, [x21]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x19, SP, #12\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z19.s }, p0/Z, [x21]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x19, SP, #9\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x20, SP, #6\n"
+ "addvl x21, SP, #6\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #12\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x20, SP, #3\n"
+ "addvl x21, SP, #3\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #9\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"19:" // Padded: 0 priming loads
- "cmp x17, #0x2\n"
+ "cmp x7, #0x2\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
- "sub x17, x17, #0x2\n"
- "sub x15, x15, #0x1\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "lsr x19, x17, #0x1\n"
- "cmp x19, x15\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "csel x24, x19, x15, LT\n"
- "add x16, x16, %x[ld_in_col]\n"
- "and x17, x17, #0x1\n"
- "sub x15, x15, x24\n"
- "cbz x24, 21f\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
"20:" // Padded: Main loop
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add x19, x16, %x[ld_in_row]\n"
- "addvl x21, SP, #3\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #9\n"
- "subs x24, x24, #0x1\n"
+ ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"mov x12, #0x4\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"add z17.h, p0/M, z17.h, z9.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
"bgt 20b\n"
"21:" // Main loop tail
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add x21, x16, %x[ld_in_row]\n"
- "addvl x20, SP, #3\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- "addvl x19, SP, #9\n"
+ ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
+ "ld1b { z21.s }, p0/Z, [x22]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
"22:" // Main loop skip tail
- "cbz x17, 23f\n" // Skip remainder inputs
+ "cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- "addvl x20, SP, #6\n"
+ "addvl x21, SP, #6\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "addvl x19, SP, #12\n"
- "sub x15, x15, #0x1\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"23:" // Tail input: End
- "cbz x15, 25f\n"
+ "cbz x16, 25f\n"
"24:" // Right padding loop
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add x8, x8, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "subs x15, x15, #0x1\n"
+ "subs x16, x16, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
"bgt 24b\n"
"25:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #16\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x7\n"
- "whilelt p1.s, x7, x6\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x19\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #16\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x17, x17, x20\n"
+ "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index ad765ba659..1636225b31 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,18 +69,18 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x6\n"
- "ldr x8, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z24.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x7\n"
+ "sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x8\n"
+ "whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-12\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z24.h, p2/M, z24.h\n"
@@ -90,377 +90,377 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"ld1rw { z22.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z8.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z8.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1sb { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z20.h, #0x0\n"
"sub z27.h, z27.h, z21.h\n"
- "incw x21\n"
- "ld1sb { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "incw x22\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z23.h, z23.h, z21.h\n"
"trn1 z0.h, z20.h, z27.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z21.h\n"
- "mov x19, x21\n"
+ "mov x20, x22\n"
"trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"trn1 z2.h, z23.h, z16.h\n"
"trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z27.h, z27.h, z21.h\n"
"sub z23.h, z23.h, z21.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z21.h\n"
- "addvl x20, SP, #12\n"
- "incw x21\n"
- "addvl x20, x20, #-4\n"
- "mov x19, x21\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z0.h }, p2, [x21]\n"
"trn1 z0.h, z20.h, z27.h\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z1.h, z27.h, z23.h\n"
- "ld1sb { z27.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z2.h, z23.h, z16.h\n"
- "ld1sb { z23.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z3.h, z16.h, z20.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"sub z27.h, z27.h, z21.h\n"
"sub z23.h, z23.h, z21.h\n"
- "addvl x20, x20, #-4\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z16.h, z16.h, z21.h\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
"mov z9.d, z8.d\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z0.h, z20.h, z27.h\n"
"trn1 z1.h, z27.h, z23.h\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
- "addvl x20, x20, #-4\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
"trn1 z2.h, z23.h, z16.h\n"
"trn1 z3.h, z16.h, z20.h\n"
- "st1h { z0.h }, p2, [x20]\n"
- "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z2.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z10.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "st1h { z0.h }, p2, [x21]\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z2.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z3.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z10.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z11.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z11.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x17, x22, LSL #22\n"
- "mov x21, #0x6\n"
- "add x20, x8, x7\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "mov x11, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x8, x14\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x14, x8, x19, x14\n"
- ".inst 0xc0046900 // mova za.d[x11, #0], { z8.d-z9.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046901 // mova za.d[x11, #1], { z8.d-z9.d }\n"
- "mov x21, #0x2\n"
- "ldp x10, x9, [x24], #0x10\n"
- ".inst 0xc0046902 // mova za.d[x11, #2], { z8.d-z9.d }\n"
- "ldp x28, x27, [x19], #0x10\n"
- ".inst 0xc0046903 // mova za.d[x11, #3], { z8.d-z9.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- "ldp x26, x25, [x24], #0x10\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
- "ldp x24, x23, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "sub x13, x13, x20\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040900 // mova za.d[x8, #0], { z8.d-z9.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040901 // mova za.d[x8, #1], { z8.d-z9.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x25], #0x10\n"
+ ".inst 0xc0040902 // mova za.d[x8, #2], { z8.d-z9.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040903 // mova za.d[x8, #3], { z8.d-z9.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ "ldp x27, x26, [x25], #0x10\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x8, x7\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #8\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z17.h, z16.h\n"
- "add z13.h, z13.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z17.h, z16.h\n"
- "add z14.h, z14.h, z24.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z15.h, z17.h, z16.h\n"
- "add z15.h, z15.h, z24.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- "9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
- "addvl x20, SP, #4\n"
+ "addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z17.h, z16.h\n"
"add z13.h, z13.h, z24.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #8\n"
+ "add x14, x14, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z17.h, z16.h\n"
"add z14.h, z14.h, z24.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x14, x14, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"trn1 z15.h, z17.h, z16.h\n"
"add z15.h, z15.h, z24.h\n"
".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z17.h, z16.h\n"
+ "add z13.h, z13.h, z24.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z17.h, z16.h\n"
+ "add z14.h, z14.h, z24.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "trn1 z15.h, z17.h, z16.h\n"
+ "add z15.h, z15.h, z24.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
"10:" // Unpadded: 0 priming loads
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z17.h, z16.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
"add z13.h, z13.h, z24.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z17.h, z16.h\n"
- "csel x22, x15, x13, LT\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z14.h, z14.h, z24.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"trn1 z15.h, z17.h, z16.h\n"
"add z15.h, z15.h, z24.h\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- "addvl x21, SP, #4\n"
- "addvl x20, SP, #8\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
"ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "add x19, x14, %x[ld_in_row]\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
"trn1 z13.h, z21.h, z20.h\n"
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z4.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
"add z13.h, z13.h, z24.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
"trn1 z14.h, z19.h, z18.h\n"
"trn1 z15.h, z17.h, z16.h\n"
- "add x11, x11, #0x2\n"
+ "add x8, x8, #0x2\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- "st1b { z5.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"add z14.h, z14.h, z24.h\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
"add z15.h, z15.h, z24.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
- "cbz x21, 15f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #8\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
"add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #4\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
"add z16.h, p0/M, z16.h, z24.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #8\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
"trn1 z15.h, z17.h, z16.h\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
"15:" // Padded: 0 priming loads
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
@@ -469,192 +469,192 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z19.s }, p0/Z, [x14]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z19.h, z18.h\n"
"trn1 z14.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"sub x15, x15, #0x1\n"
"sub x13, x13, #0x1\n"
"cmp x15, x13\n"
"trn1 z15.h, z17.h, z16.h\n"
- "csel x22, x15, x13, LT\n"
+ "csel x23, x15, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"16:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z21.s }, p0/Z, [x14]\n"
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
"add z21.h, p0/M, z21.h, z24.h\n"
- "add x21, x14, %x[ld_in_row]\n"
+ "add x22, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
"add z20.h, p0/M, z20.h, z24.h\n"
- "add x21, x21, %x[ld_in_row]\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
"add z19.h, p0/M, z19.h, z24.h\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
- "addvl x20, SP, #4\n"
+ "addvl x21, SP, #4\n"
"add z18.h, p0/M, z18.h, z24.h\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #8\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z24.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
"add z16.h, p0/M, z16.h, z24.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "st1b { z4.s }, p1, [x10]\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
"trn1 z13.h, z21.h, z20.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
- "add x11, x11, #0x2\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
".inst 0xa0412be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"trn1 z14.h, z19.h, z18.h\n"
"trn1 z15.h, z17.h, z16.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16175a8 // sdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- "addvl x20, SP, #4\n"
- "addvl x19, SP, #8\n"
- ".inst 0xc16075a9 // sdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16375c8 // sdot za.s[x11, 0], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275c9 // sdot za.s[x11, 1], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc16115a8 // sdot za.s[x8, 0], { z13.h-z14.h }, z1.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16015a9 // sdot za.s[x8, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16315c8 // sdot za.s[x8, 0], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215c9 // sdot za.s[x8, 1], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- ".inst 0xc16175aa // sdot za.s[x11, 2], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115aa // sdot za.s[x8, 2], { z13.h-z14.h }, z1.h\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc16075ab // sdot za.s[x11, 3], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0402a60 // ld1h { z0.h-z1.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc16015ab // sdot za.s[x8, 3], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc16175ac // sdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc16115ac // sdot za.s[x8, 4], { z13.h-z14.h }, z1.h\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16015ad // sdot za.s[x8, 5], { z13.h-z14.h }, z0.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16075ad // sdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc16375ca // sdot za.s[x11, 2], { z14.h-z15.h }, z3.h\n"
- "st1b { z5.s }, p1, [x26]\n"
+ ".inst 0xc16315ca // sdot za.s[x8, 2], { z14.h-z15.h }, z3.h\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16215cb // sdot za.s[x8, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16275cb // sdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0412a62 // ld1h { z2.h-z3.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- ".inst 0xc16375cc // sdot za.s[x11, 4], { z14.h-z15.h }, z3.h\n"
- ".inst 0xc16275cd // sdot za.s[x11, 5], { z14.h-z15.h }, z2.h\n"
- "add x11, x11, #0x2\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc16315cc // sdot za.s[x8, 4], { z14.h-z15.h }, z3.h\n"
+ ".inst 0xc16215cd // sdot za.s[x8, 5], { z14.h-z15.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
".inst 0xc1aaac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z10.s\n"
- "add x11, x11, #0x2\n"
+ "add x8, x8, #0x2\n"
".inst 0xc1abaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
- ".inst 0xc0046904 // mova za.d[x11, #4], { z8.d-z9.d }\n"
+ ".inst 0xc0040904 // mova za.d[x8, #4], { z8.d-z9.d }\n"
".inst 0xc1acab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0046905 // mova za.d[x11, #5], { z8.d-z9.d }\n"
+ ".inst 0xc0040905 // mova za.d[x8, #5], { z8.d-z9.d }\n"
".inst 0xc1bacec4 // sclamp { z4.s-z7.s }, z22.s, z26.s\n"
- "st1b { z4.s }, p1, [x10]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z5.s }, p1, [x26]\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z7.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 19b\n"
"20:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x19\n"
+ "add x14, x14, x20\n"
"str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #12\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_2rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_2rows_dot_za/generic.cpp
deleted file mode 100644
index 328227f91a..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_2rows_dot_za/generic.cpp
+++ /dev/null
@@ -1,592 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(ARM_COMPUTE_ENABLE_SME2)
-
-#include <algorithm>
-#include <cstddef>
-#include "arm_gemm.hpp"
-
-using arm_gemm::Requantize32;
-
-namespace arm_conv {
-namespace depthwise {
-
-void sme2_u8s8u8q_planar_3x3_s2_2rows_dot_za_impl(
- const uint8_t *inptr,
- size_t ld_in_row,
- size_t ld_in_col,
- unsigned int pad_top,
- unsigned int valid_input_rows,
- unsigned int pad_left,
- unsigned int valid_input_cols,
- const int8_t *weights,
- uint8_t **outptrs,
- const size_t *outlds,
- unsigned int output_cols,
- unsigned int start_channel,
- unsigned int valid_channels,
- const arm_gemm::Requantize32 &qp
-)
-{
- struct Args
- {
- const uint8_t *inptr;
- long unsigned int pad_top, pad_bottom, pad_left;
- const int8_t *weights;
- long unsigned int input_cols, output_cols;
- uint8_t **outptrs;
- const size_t *ld_out_cols;
- long unsigned int n, n_channels;
- };
-
- Args args = { inptr, pad_top, 5u - std::min(5u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, start_channel, valid_channels };
-
- __asm__ __volatile__(
- "ldr x11, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x19, #0x5\n"
- ".inst 0xd503477f // SMSTART ZA\n"
- "sub x19, x19, x11\n"
- "ldr x10, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ptrue p0.b\n"
- "mov z12.s, #0x0\n"
- "ldr x22, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p5.s, XZR, x22\n"
- "whilelt p9.s, XZR, x19\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "whilelt p8.s, XZR, x10\n"
- "eor p8.b, p0/Z, p8.b, p9.b\n"
- "ldr x21, [%x[args], %[offsetof_Args_n]]\n"
- "cbz x19, 1f\n"
- "ld1w { z12.s }, p5/Z, [x19, x21, LSL #2]\n"
- "1:" // Load bias: Done
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "mov z0.h, #0x0\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "mov z13.d, z12.d\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "incw x20\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "ld1rh { z28.h }, p0/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z27.h, z27.h, z28.h\n"
- "sub z16.h, z16.h, z28.h\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z21.h, z21.h, z28.h\n"
- "trn1 z8.h, z27.h, z21.h\n"
- "sub z20.h, z20.h, z28.h\n"
- "sub z18.h, z18.h, z28.h\n"
- "trn1 z7.h, z16.h, z20.h\n"
- "sub z17.h, z17.h, z28.h\n"
- "sub z24.h, z24.h, z28.h\n"
- "trn1 z6.h, z17.h, z0.h\n"
- "sub z19.h, z19.h, z28.h\n"
- "trn1 z5.h, z24.h, z0.h\n"
- "trn1 z4.h, z22.h, z18.h\n"
- "trn1 z3.h, z19.h, z0.h\n"
- "ld1rh { z21.h }, p0/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "ld1rw { z2.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z1.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "cbz x19, 2f\n"
- "ld1w { z1.s }, p5/Z, [x19, x21, LSL #2]\n"
- "2:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "ld1rw { z0.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "cbz x19, 3f\n"
- "ld1w { z0.s }, p5/Z, [x19, x21, LSL #2]\n"
- "3:" // Load right_shift: End
- "ldr x28, [%x[args], %[offsetof_Args_input_cols]]\n"
- "orr x21, x28, %x[ld_in_col], LSL #16\n"
- "orr x21, x22, x21, LSL #22\n"
- "ld1rw { z20.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ldr x27, [%x[args], %[offsetof_Args_inptr]]\n"
- "mov x20, #0x5\n"
- "add x19, x10, x11\n"
- "ld1rw { z19.s }, p0/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov x9, #0x0\n"
- "ldr x26, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x21, x21, #0x0\n"
- "sub x20, x20, x19\n"
- "mov x19, x27\n"
- "4:" // Issue prefetches
- "subs x20, x20, #0x1\n"
- ".inst 0xf8b54a7c // rprfm pldstrm, x21, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
- "bgt 4b\n"
- "ldr x21, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x27, x10, x19, x27\n"
- ".inst 0xc0042980 // mova za.d[x9, #0], { z12.d-z13.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0042981 // mova za.d[x9, #1], { z12.d-z13.d }\n"
- "mov x25, #0x2\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "ldp x24, x23, [x21], #0x10\n"
- "ldp x22, x21, [x19], #0x10\n"
- "cbz x20, 6f\n"
- "cmp x20, x25\n"
- "csel x19, x20, x25, LT\n"
- "sub x20, x20, x19\n"
- "sub x25, x25, x19\n"
- "cbz x20, 6f\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "and x25, x20, #0x1\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- "sub x26, x26, x20\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "5:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "bgt 5b\n"
- "6:" // Left padding: End
- "adds XZR, x10, x11\n"
- "bne 11f\n"
- "cbz x25, 9f\n"
- "cmp x25, #0x1\n"
- "sub x28, x28, x25\n"
- "beq 8f\n"
- "7:" // Unpadded: 2 priming loads
- "add x19, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16835c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z8.h\n"
- ".inst 0xc16635e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z6.h\n"
- "8:" // Unpadded: 1 priming loads
- "add x19, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "9:" // Unpadded: 0 priming loads
- "add x20, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "sub x28, x28, #0x2\n"
- "ld1b { z18.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z18.h, z18.h, z21.h\n"
- "sub x26, x26, #0x1\n"
- "ld1b { z15.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "lsr x19, x28, #0x1\n"
- "ld1b { z17.s }, p5/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "cmp x19, x26\n"
- "ld1b { z16.s }, p5/Z, [x20]\n"
- "sub z16.h, z16.h, z21.h\n"
- "csel x20, x19, x26, LT\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "add x27, x27, %x[ld_in_col]\n"
- "and x28, x28, #0x1\n"
- "sub x26, x26, x20\n"
- "cbz x20, 16f\n"
- "10:" // Unpadded: Main loop
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- "subs x20, x20, #0x1\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- "sub z14.h, z14.h, z21.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- "sub z18.h, z18.h, z21.h\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z17.h, z17.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "add x9, x9, #0x1\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "ld1b { z14.s }, p5/Z, [x27]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "ld1b { z18.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- "ld1b { z15.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z14.h, z14.h, z21.h\n"
- "sub z18.h, z18.h, z21.h\n"
- "ld1b { z17.s }, p5/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z15.h, z15.h, z21.h\n"
- "sub z17.h, z17.h, z21.h\n"
- "ld1b { z16.s }, p5/Z, [x19]\n"
- "sub z16.h, z16.h, z21.h\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "add x27, x27, %x[ld_in_col]\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "bgt 10b\n"
- "b 16f\n"
- "11:" // Padded
- "cbz x25, 14f\n"
- "cmp x25, #0x1\n"
- "sub x28, x28, x25\n"
- "beq 13f\n"
- "12:" // Padded: 2 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16835c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z8.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc16635e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z6.h\n"
- "13:" // Padded: 1 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "14:" // Padded: 0 priming loads
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "sub x28, x28, #0x2\n"
- "sub x26, x26, #0x1\n"
- "lsr x19, x28, #0x1\n"
- "mov z16.d, z16.d\n"
- "cmp x19, x26\n"
- "csel x20, x19, x26, LT\n"
- "add x27, x27, %x[ld_in_col]\n"
- "and x28, x28, #0x1\n"
- "sub x26, x26, x20\n"
- "cbz x20, 16f\n"
- "15:" // Padded: Main loop
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "add x9, x9, #0x1\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "add x19, x27, %x[ld_in_row]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "subs x20, x20, #0x1\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "mov z16.d, z16.d\n"
- "add x27, x27, %x[ld_in_col]\n"
- "bgt 15b\n"
- "16:" // Main loop tail
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "mov x12, #0x4\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "add x9, x9, #0x1\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add x27, x27, %x[ld_in_col]\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc16735c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z7.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- ".inst 0xc16535e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z5.h\n"
- "cbz x28, 17f\n" // Skip remainder inputs
- "mov x12, #0x0\n"
- ".inst 0x25305504 // psel p4.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p4/Z, [x27]\n"
- "sub z14.h, p4/M, z14.h, z21.h\n"
- "add x19, x27, %x[ld_in_row]\n"
- ".inst 0x25705503 // psel p3.s, p5.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p3/Z, [x19]\n"
- "sub z18.h, p3/M, z18.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25b05502 // psel p2.s, p5.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p2/Z, [x19]\n"
- "sub z15.h, p2/M, z15.h, z21.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0x25f05501 // psel p1.s, p5.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "sub z17.h, p1/M, z17.h, z21.h\n"
- "mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "trn1 z15.h, z15.h, z17.h\n"
- ".inst 0x25305500 // psel p0.s, p5.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "sub z16.h, p0/M, z16.h, z21.h\n"
- "mov z16.d, z16.d\n"
- ".inst 0xc16435c8 // sdot za.s[x9, 0], { z14.h-z15.h }, z4.h\n"
- "sub x26, x26, #0x1\n"
- ".inst 0xc16335e8 // sdot za.s[x9, 0], { z15.h-z16.h }, z3.h\n"
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- ".inst 0xc16835c9 // sdot za.s[x9, 1], { z14.h-z15.h }, z8.h\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc16635e9 // sdot za.s[x9, 1], { z15.h-z16.h }, z6.h\n"
- "add x9, x9, #0x1\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "17:" // Tail input: End
- "cbz x26, 19f\n"
- "18:" // Right padding loop
- ".inst 0xc0062818 // mova { z24.d-z25.d }, za.d[x9, #0]\n"
- ".inst 0xc1a1a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
- "add x9, x9, #0x1\n"
- ".inst 0xc1a0a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n"
- "subs x26, x26, #0x1\n"
- ".inst 0xc0042982 // mova za.d[x9, #2], { z12.d-z13.d }\n"
- ".inst 0xc1a2a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z2.s\n"
- ".inst 0xc1b3c698 // sclamp { z24.s-z25.s }, z20.s, z19.s\n"
- "st1b { z24.s }, p5, [x24]\n"
- "add x24, x24, x22\n"
- "st1b { z25.s }, p5, [x23]\n"
- "add x23, x23, x21\n"
- "bgt 18b\n"
- "19:" // End
- ".inst 0xd503467f // SMSTOP\n"
- :
- : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_n] "I" (offsetof(Args, n)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace depthwise
-} // namespace arm_conv
-
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 7a9724c667..2848a015db 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,18 +69,18 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x9\n"
- "ldr x8, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z5.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x7\n"
+ "sub x20, x20, x6\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x8\n"
+ "whilelt p8.s, XZR, x7\n"
"addvl SP, SP, #-6\n"
"ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z5.h, p2/M, z5.h\n"
@@ -90,317 +90,317 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z0.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z0.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"ld1rh { z13.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"sub z24.h, z24.h, z13.h\n"
- "incw x21\n"
+ "incw x22\n"
"mov z17.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z25.h, z25.h, z13.h\n"
"trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z13.h\n"
- "mov x19, x21\n"
+ "mov x20, x22\n"
"trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z24.h, z24.h, z13.h\n"
- "addvl x20, SP, #6\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
"sub z25.h, z25.h, z13.h\n"
- "incw x21\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z16.h, z16.h, z13.h\n"
- "addvl x20, x20, #-2\n"
- "mov x19, x21\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z10.h }, p2, [x21]\n"
"trn1 z10.h, z24.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #3\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z11.h, z16.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z24.h, z24.h, z13.h\n"
"sub z25.h, z25.h, z13.h\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
"sub z16.h, z16.h, z13.h\n"
- "addvl x20, x20, #-2\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z10.h }, p2, [x21]\n"
"mov z1.d, z0.d\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
- "addvl x20, x20, #-2\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"trn1 z10.h, z24.h, z25.h\n"
- "st1h { z10.h }, p2, [x20]\n"
+ "st1h { z10.h }, p2, [x21]\n"
"trn1 z11.h, z16.h, z17.h\n"
- "st1h { z11.h }, p2, [x20, #1, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z8.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "st1h { z11.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z8.s }, p1/Z, [x20, x16, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z7.s }, p1/Z, [x19, x16, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z7.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x15, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x17, x22, LSL #22\n"
- "mov x21, #0x9\n"
- "add x20, x8, x7\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "mov x11, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x8, x14\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x14, x8, x19, x14\n"
- ".inst 0xc0046c00 // mova za.d[x11, #0], { z0.d-z3.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046c01 // mova za.d[x11, #1], { z0.d-z3.d }\n"
- "mov x21, #0x2\n"
- "ldp x10, x9, [x24], #0x10\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "ldp x28, x27, [x19], #0x10\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
- "ldp x26, x25, [x24], #0x10\n"
- "ldp x24, x23, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040c00 // mova za.d[x8, #0], { z0.d-z3.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040c01 // mova za.d[x8, #1], { z0.d-z3.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x25], #0x10\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x25], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "and x21, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "sub x13, x13, x20\n"
+ "sub x13, x13, x21\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x8, x7\n"
+ "adds XZR, x7, x6\n"
"bne 12f\n"
- "cbz x21, 10f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #4\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"9:" // Unpadded: 1 priming loads
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
- "addvl x19, SP, #2\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "lsr x19, x15, #0x1\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
- "cmp x19, x13\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "csel x22, x19, x13, LT\n"
+ "cmp x20, x13\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
- "add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- "addvl x20, SP, #2\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "add x19, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
"trn1 z12.h, z12.h, z20.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add z12.h, z12.h, z5.h\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z5.h\n"
- "ld1b { z14.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "add x11, x11, #0x1\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z5.h\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"ld1b { z12.s }, p1/Z, [x14]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "ld1b { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "st1b { z28.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
- "st1b { z29.s }, p1, [x9]\n"
- "ld1b { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "st1b { z29.s }, p1, [x10]\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z12.h, z12.h, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"mov z16.d, z16.d\n"
"add z13.h, z13.h, z5.h\n"
"add x14, x14, %x[ld_in_col]\n"
@@ -411,108 +411,108 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
- "cbz x21, 15f\n"
- "cmp x21, #0x1\n"
- "sub x15, x15, x21\n"
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
- "addvl x19, SP, #4\n"
+ "addvl x20, SP, #4\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
- "addvl x19, SP, #2\n"
+ "addvl x20, SP, #2\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
@@ -521,357 +521,357 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- "lsr x19, x15, #0x1\n"
- "cmp x19, x13\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
"mov z16.d, z16.d\n"
- "csel x21, x19, x13, LT\n"
+ "csel x22, x20, x13, LT\n"
"add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x21\n"
- "cbz x21, 17f\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
"16:" // Padded: Main loop
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z19.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
"mov x12, #0x8\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "addvl x19, SP, #2\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
"mov x12, #0x0\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
- "add x11, x11, #0x1\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"mov z16.d, z16.d\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "subs x21, x21, #0x1\n"
+ "subs x22, x22, #0x1\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
"trn1 z12.h, z12.h, z20.h\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z29.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "st1b { z30.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
+ "st1b { z31.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
"add x14, x14, %x[ld_in_col]\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
- "addvl x19, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x19, x14, %x[ld_in_row]\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
"add z19.h, p0/M, z19.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z17.h, p0/M, z17.h, z5.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #2\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"trn1 z12.h, z12.h, z20.h\n"
- "add x11, x11, #0x1\n"
+ "add x8, x8, #0x1\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
"add x14, x14, %x[ld_in_col]\n"
"trn1 z15.h, z15.h, z17.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
".inst 0xa0402bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z12.s }, p0/Z, [x14]\n"
"add z12.h, p0/M, z12.h, z5.h\n"
- "add x19, x14, %x[ld_in_row]\n"
+ "add x20, x14, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z5.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z17.h, p0/M, z17.h, z5.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z5.h\n"
"trn1 z14.h, z14.h, z18.h\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "addvl x19, SP, #4\n"
- ".inst 0xc17a7588 // sdot za.s[x11, 0], { z12.h-z15.h }, z10.h\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"sub x13, x13, #0x1\n"
- ".inst 0xc17b75a8 // sdot za.s[x11, 0], { z13.h-z16.h }, z11.h\n"
- ".inst 0xa0402a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
- ".inst 0xc17a7589 // sdot za.s[x11, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- ".inst 0xc17b75a9 // sdot za.s[x11, 1], { z13.h-z16.h }, z11.h\n"
- "add x11, x11, #0x1\n"
+ ".inst 0xc17b15a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z11.h\n"
+ "add x8, x8, #0x1\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0066c1c // mova { z28.d-z31.d }, za.d[x11, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
- "add x11, x11, #0x1\n"
+ "add x8, x8, #0x1\n"
".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0046c02 // mova za.d[x11, #2], { z0.d-z3.d }\n"
+ ".inst 0xc0040c02 // mova za.d[x8, #2], { z0.d-z3.d }\n"
".inst 0xc1a4ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
".inst 0xc1b7cf7c // sclamp { z28.s-z31.s }, z27.s, z23.s\n"
- "st1b { z28.s }, p1, [x10]\n"
+ "st1b { z28.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z29.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z29.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "st1b { z30.s }, p1, [x26]\n"
+ "st1b { z30.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z31.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "st1b { z31.s }, p1, [x25]\n"
- "add x25, x25, x23\n"
"bgt 20b\n"
"21:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
"incw x16\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
"whilelt p1.s, x16, x17\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x14, x14, x19\n"
+ "add x14, x14, x20\n"
"str x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #6\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index d6970647d2..3e77c75ad7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,20 +69,20 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0x8\n"
+ "mov x20, #0x8\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z25.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x5\n"
+ "sub x20, x20, x4\n"
".inst 0x25207812 // ptrue pn10.b\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
"whilelt p1.s, XZR, x7\n"
- "whilelt p9.s, XZR, x19\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
"whilelt p8.s, XZR, x6\n"
"addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z25.h, p2/M, z25.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
@@ -90,298 +90,262 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z6.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z6.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x5, LSL #2]\n"
"2:" // Load bias: Done
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x21, x23\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "mov x22, x23\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"ld1rh { z12.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"mov z2.h, #0x0\n"
"sub z18.h, z18.h, z12.h\n"
"incw x23\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z17.h, z17.h, z12.h\n"
"trn1 z0.h, z2.h, z18.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z21.h, z21.h, z12.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z16.h, z16.h, z12.h\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
"sub z15.h, z15.h, z12.h\n"
- "mov x21, x23\n"
+ "mov x22, x23\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"trn1 z10.h, z16.h, z15.h\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z18.h, z18.h, z12.h\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"sub z21.h, z21.h, z12.h\n"
- "addvl x20, SP, #30\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
+ "addvl x21, SP, #30\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
"incw x23\n"
"sub z16.h, z16.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
- "addvl x20, x20, #-6\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
+ "addvl x21, x21, #-6\n"
"sub z15.h, z15.h, z12.h\n"
- "mov x21, x23\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "mov x22, x23\n"
+ "st1h { z0.h }, p2, [x21]\n"
"trn1 z0.h, z2.h, z18.h\n"
"incw x23\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
"sub z18.h, z18.h, z12.h\n"
- "addvl x20, x20, #-6\n"
+ "addvl x21, x21, #-6\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
"sub z21.h, z21.h, z12.h\n"
- "mov x21, x23\n"
+ "mov x22, x23\n"
"sub z16.h, z16.h, z12.h\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x21]\n"
"incw x23\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z0.h, z2.h, z18.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
"sub z18.h, z18.h, z12.h\n"
"sub z17.h, z17.h, z12.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
- "addvl x20, x20, #-6\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
+ "addvl x21, x21, #-6\n"
"sub z21.h, z21.h, z12.h\n"
"sub z16.h, z16.h, z12.h\n"
- "mov x21, x23\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "mov x22, x23\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z0.h, z2.h, z18.h\n"
"trn1 z8.h, z18.h, z17.h\n"
- "ld1sb { z18.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z4.h, z17.h, z21.h\n"
- "ld1sb { z17.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
- "ld1sb { z21.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "ld1sb { z21.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z10.h, z16.h, z15.h\n"
- "ld1sb { z16.s }, p2/Z, [x21]\n"
- "incw x21, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x22]\n"
+ "incw x22, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
"trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z15.s }, p2/Z, [x21]\n"
+ "ld1sb { z15.s }, p2/Z, [x22]\n"
"sub z18.h, z18.h, z12.h\n"
- "addvl x20, x20, #-6\n"
+ "addvl x21, x21, #-6\n"
"sub z17.h, z17.h, z12.h\n"
"sub z21.h, z21.h, z12.h\n"
- "st1h { z0.h }, p2, [x20]\n"
+ "st1h { z0.h }, p2, [x21]\n"
"sub z16.h, z16.h, z12.h\n"
"sub z15.h, z15.h, z12.h\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
"mov z7.d, z6.d\n"
"trn1 z0.h, z2.h, z18.h\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
"trn1 z8.h, z18.h, z17.h\n"
"trn1 z4.h, z17.h, z21.h\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
"trn1 z5.h, z21.h, z16.h\n"
"trn1 z10.h, z16.h, z15.h\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
- "addvl x20, x20, #-6\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
+ "addvl x21, x21, #-6\n"
"trn1 z11.h, z15.h, z2.h\n"
- "st1h { z0.h }, p2, [x20]\n"
- "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z4.h }, p2, [x20, #2, MUL VL]\n"
- "st1h { z5.h }, p2, [x20, #3, MUL VL]\n"
- "st1h { z10.h }, p2, [x20, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x20, #5, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z3.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "st1h { z0.h }, p2, [x21]\n"
+ "st1h { z8.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z5.h }, p2, [x21, #3, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x21, #5, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x20, x5, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z1.s }, p1/Z, [x19, x17, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z1.s }, p1/Z, [x20, x5, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x16, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x7, x22, LSL #22\n"
- "mov x21, #0x8\n"
- "add x20, x6, x5\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "ldr x14, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x17, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x4\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x11, #0x0\n"
"mov x8, #0x8\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x6, x15\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x15, x6, x19, x15\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
".inst 0xc00468c0 // mova za.d[x11, #0], { z6.d-z7.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc00468c1 // mova za.d[x11, #1], { z6.d-z7.d }\n"
- "mov x21, #0x4\n"
- "ldp x13, x4, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x25], #0x10\n"
".inst 0xc00468c2 // mova za.d[x11, #2], { z6.d-z7.d }\n"
- "ldp x10, x9, [x19], #0x10\n"
+ "ldp x3, x10, [x20], #0x10\n"
".inst 0xc00468c3 // mova za.d[x11, #3], { z6.d-z7.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc00468c4 // mova za.d[x11, #4], { z6.d-z7.d }\n"
- "ldp x28, x27, [x24], #0x10\n"
+ "ldp x9, x28, [x25], #0x10\n"
".inst 0xc00468c5 // mova za.d[x11, #5], { z6.d-z7.d }\n"
- "ldp x26, x25, [x19], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
".inst 0xc00468c6 // mova za.d[x11, #6], { z6.d-z7.d }\n"
".inst 0xc00468c7 // mova za.d[x11, #7], { z6.d-z7.d }\n"
".inst 0xc00408c0 // mova za.d[x8, #0], { z6.d-z7.d }\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
- "sub x14, x14, x20\n"
+ "sub x15, x15, x21\n"
".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x6, x5\n"
+ "adds XZR, x6, x4\n"
"bne 14f\n"
- "cbz x21, 12f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 11f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 10f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x20, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x19, SP, #24\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z17.h, z16.h\n"
- "add z27.h, z27.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z17.h, z16.h\n"
- "add z28.h, z28.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z29.h, z16.h, z29.h\n"
- "add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z25.h\n"
- ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- "9:" // Unpadded: 3 priming loads
- "add x21, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x20, SP, #18\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z17.h, z16.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z16.h, z29.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
@@ -390,47 +354,37 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"ld1b { z16.s }, p1/Z, [x21]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add z30.h, z30.h, z25.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
+ "add z30.h, z30.h, z25.h\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
- ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- "10:" // Unpadded: 2 priming loads
- "add x22, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x21, SP, #12\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add x15, x15, %x[ld_in_col]\n"
".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
@@ -441,54 +395,44 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"add z30.h, z30.h, z25.h\n"
".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
- ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- "11:" // Unpadded: 1 priming loads
- "add x23, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "addvl x22, SP, #6\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
"add z27.h, z27.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x21, SP, #12\n"
+ "addvl x21, SP, #18\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
"add z28.h, z28.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x20, SP, #18\n"
+ "addvl x20, SP, #24\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
"ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "addvl x19, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x23]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
@@ -504,121 +448,177 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z27.h, z17.h, z16.h\n"
+ "add z27.h, z27.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z17.h, z16.h\n"
+ "add z28.h, z28.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z17.h, z16.h\n"
+ "add z29.h, z29.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x24]\n"
+ "trn1 z30.h, z17.h, z16.h\n"
+ ".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
+ "add z30.h, z30.h, z25.h\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
+ ".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
"12:" // Unpadded: 0 priming loads
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 22f\n"
- "add x19, x15, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x15]\n"
- "sub x16, x16, #0x1\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "cbz x17, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x16]\n"
+ "sub x17, x17, #0x1\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z17.h, z16.h\n"
- "sub x14, x14, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "cmp x16, x14\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x17, x15\n"
"add z27.h, z27.h, z25.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z28.h, z17.h, z16.h\n"
- "csel x24, x16, x14, LT\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "csel x25, x17, x15, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z28.h, z28.h, z25.h\n"
- "add x15, x15, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z29.h, z17.h, z16.h\n"
"add z29.h, z29.h, z25.h\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "sub x14, x14, x24\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"trn1 z30.h, z17.h, z16.h\n"
"add z30.h, z30.h, z25.h\n"
- "cbz x24, 21f\n"
+ "cbz x25, 21f\n"
"13:" // Unpadded: Main loop
- "addvl x23, SP, #6\n"
+ "addvl x24, SP, #6\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x22, SP, #12\n"
- "ld1b { z23.s }, p1/Z, [x15]\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z23.s }, p1/Z, [x16]\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
- "addvl x21, SP, #18\n"
- "addvl x20, SP, #24\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add x19, x15, %x[ld_in_row]\n"
- "ld1b { z22.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "subs x24, x24, #0x1\n"
- "add x15, x15, %x[ld_in_col]\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- "ld1b { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xa0412b04 // ld1h { z4.h-z5.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa0422b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
"trn1 z27.h, z23.h, z22.h\n"
@@ -645,407 +645,407 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
"add z30.h, z30.h, z25.h\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z14.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
- "cbz x21, 19f\n"
- "cmp x21, #0x1\n"
- "sub x16, x16, x21\n"
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x17, x17, x22\n"
"beq 18f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 17f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x20, x15, %x[ld_in_row]\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"trn1 z30.h, z17.h, z16.h\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #18\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #24\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
"trn1 z30.h, z17.h, z16.h\n"
- "add x15, x15, %x[ld_in_col]\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
- "addvl x21, SP, #12\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x20, SP, #18\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
"trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
- "addvl x22, SP, #6\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
"trn1 z29.h, z18.h, z16.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x21, SP, #12\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
"trn1 z30.h, z17.h, z16.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
- "addvl x19, SP, #24\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
"19:" // Padded: 0 priming loads
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "cbz x16, 22f\n"
+ "cbz x17, 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x15]\n"
+ "ld1b { z19.s }, p0/Z, [x16]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x15, %x[ld_in_row]\n"
+ "add x20, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z27.h, z19.h, z18.h\n"
"trn1 z28.h, z17.h, z16.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
- "sub x16, x16, #0x1\n"
- "sub x14, x14, #0x1\n"
- "cmp x16, x14\n"
+ "sub x17, x17, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x17, x15\n"
"trn1 z29.h, z19.h, z18.h\n"
"trn1 z30.h, z17.h, z16.h\n"
- "csel x24, x16, x14, LT\n"
- "add x15, x15, %x[ld_in_col]\n"
- "sub x14, x14, x24\n"
- "cbz x24, 21f\n"
+ "csel x25, x17, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x15]\n"
+ "ld1b { z23.s }, p0/Z, [x16]\n"
"add z23.h, p0/M, z23.h, z25.h\n"
- "add x23, x15, %x[ld_in_row]\n"
+ "add x24, x16, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x23]\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #12\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
"add z22.h, p0/M, z22.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
- "ld1b { z21.s }, p0/Z, [x23]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
"add z21.h, p0/M, z21.h, z25.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
"mov x12, #0x4\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
- "ld1b { z20.s }, p0/Z, [x23]\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
"add z20.h, p0/M, z20.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "subs x24, x24, #0x1\n"
+ "subs x25, x25, #0x1\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
- "ld1b { z19.s }, p0/Z, [x23]\n"
+ "ld1b { z19.s }, p0/Z, [x24]\n"
"add z19.h, p0/M, z19.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x15, x15, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- "ld1b { z18.s }, p0/Z, [x23]\n"
+ "ld1b { z18.s }, p0/Z, [x24]\n"
"add z18.h, p0/M, z18.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
- "ld1b { z17.s }, p0/Z, [x23]\n"
+ "ld1b { z17.s }, p0/Z, [x24]\n"
"add z17.h, p0/M, z17.h, z25.h\n"
- "add x23, x23, %x[ld_in_row]\n"
+ "add x24, x24, %x[ld_in_row]\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
- "ld1b { z16.s }, p0/Z, [x23]\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
"add z16.h, p0/M, z16.h, z25.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
@@ -1069,56 +1069,56 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 20b\n"
"21:" // Main loop tail
- "addvl x22, SP, #6\n"
+ "addvl x23, SP, #6\n"
".inst 0xc1687768 // sdot za.s[x11, 0], { z27.h-z28.h }, z8.h\n"
- "addvl x21, SP, #12\n"
+ "addvl x22, SP, #12\n"
".inst 0xc1607769 // sdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #18\n"
- "addvl x19, SP, #24\n"
+ ".inst 0xa1402ae0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
".inst 0xc168776a // sdot za.s[x11, 2], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776b // sdot za.s[x11, 3], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
".inst 0xc1657788 // sdot za.s[x11, 0], { z28.h-z29.h }, z5.h\n"
".inst 0xc1647789 // sdot za.s[x11, 1], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
".inst 0xc168776c // sdot za.s[x11, 4], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
".inst 0xc165778a // sdot za.s[x11, 2], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778b // sdot za.s[x11, 3], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0412ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
".inst 0xc168776e // sdot za.s[x11, 6], { z27.h-z28.h }, z8.h\n"
".inst 0xc160776f // sdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1402a60 // ld1h { z0.h, z8.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
".inst 0xc165778c // sdot za.s[x11, 4], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778d // sdot za.s[x11, 5], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0412aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
".inst 0xc16b77aa // sdot za.s[x11, 2], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ab // sdot za.s[x11, 3], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc165778e // sdot za.s[x11, 6], { z28.h-z29.h }, z5.h\n"
".inst 0xc164778f // sdot za.s[x11, 7], { z28.h-z29.h }, z4.h\n"
- ".inst 0xa0412a64 // ld1h { z4.h-z5.h }, pn10.b/Z, [x19, #0x2, MUL VL]\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
".inst 0xc16b77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77ad // sdot za.s[x11, 5], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc16b77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z11.h\n"
".inst 0xc16a77af // sdot za.s[x11, 7], { z29.h-z30.h }, z10.h\n"
- ".inst 0xa0422a6a // ld1h { z10.h-z11.h }, pn10.b/Z, [x19, #0x4, MUL VL]\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
".inst 0xc1681768 // sdot za.s[x8, 0], { z27.h-z28.h }, z8.h\n"
".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xc1651788 // sdot za.s[x8, 0], { z28.h-z29.h }, z5.h\n"
@@ -1135,20 +1135,20 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"22:" // Main loop skip tail
- "cbz x14, 24f\n"
+ "cbz x15, 24f\n"
"23:" // Right padding loop
".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
- "subs x14, x14, #0x1\n"
+ "subs x15, x15, #0x1\n"
".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
"add x11, x11, #0x2\n"
@@ -1157,44 +1157,44 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
".inst 0xc1a9ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z9.s\n"
".inst 0xc00408c1 // mova za.d[x8, #1], { z6.d-z7.d }\n"
".inst 0xc1bfcf0c // sclamp { z12.s-z15.s }, z24.s, z31.s\n"
- "st1b { z12.s }, p1, [x13]\n"
+ "st1b { z12.s }, p1, [x14]\n"
+ "add x14, x14, x3\n"
+ "st1b { z14.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z14.s }, p1, [x4]\n"
- "add x4, x4, x9\n"
- "st1b { z13.s }, p1, [x28]\n"
+ "st1b { z13.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z15.s }, p1, [x28]\n"
"add x28, x28, x26\n"
- "st1b { z15.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
"bgt 23b\n"
"24:" // End
"ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
"incw x23, ALL, MUL #16\n"
"incw x23, ALL, MUL #9\n"
"str x23, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
- "ldr x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x15, x15, x19\n"
- "str x15, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x5\n"
+ "whilelt p1.s, x5, x7\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x16, x16, x20\n"
+ "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 8cdc94d0e9..33bb4eb8ec 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,20 +69,20 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"ptrue p2.b\n"
- "mov x19, #0xb\n"
- "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
"ld1rh { z9.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x19, x19, x4\n"
+ "sub x20, x20, x3\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x6\n"
- "whilelt p9.s, XZR, x19\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
"ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x5\n"
+ "whilelt p8.s, XZR, x4\n"
"addvl SP, SP, #-15\n"
- "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"neg z9.h, p2/M, z9.h\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
@@ -90,310 +90,227 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
- "ldr x19, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
"mov z28.s, #0x0\n"
- "cbz x19, 2f\n"
- "ld1w { z28.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x19, x21\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"sub z12.h, z12.h, z18.h\n"
- "incw x21\n"
+ "incw x22\n"
"mov z14.h, #0x0\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
- "addvl x20, SP, #15\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z17.h, z17.h, z18.h\n"
"trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "mov x19, x21\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
"sub z16.h, z16.h, z18.h\n"
- "incw x21\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "incw x22\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z12.h, z12.h, z18.h\n"
- "addvl x20, x20, #-3\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
- "st1h { z2.h }, p2, [x20]\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x21]\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z17.h, z17.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "mov x19, x21\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"sub z16.h, z16.h, z18.h\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z10.h, z24.h, z17.h\n"
"sub z12.h, z12.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z0.h, z16.h, z14.h\n"
- "incw x21\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "incw x22\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z24.h, z24.h, z18.h\n"
"sub z17.h, z17.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "addvl x20, x20, #-3\n"
- "mov x19, x21\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z2.h }, p2, [x21]\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z16.h, z16.h, z18.h\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"trn1 z10.h, z24.h, z17.h\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z12.h, z12.h, z18.h\n"
"sub z25.h, z25.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
"trn1 z0.h, z16.h, z14.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
- "incw x21\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
"sub z24.h, z24.h, z18.h\n"
"sub z17.h, z17.h, z18.h\n"
- "addvl x20, x20, #-3\n"
- "mov x19, x21\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z2.h }, p2, [x21]\n"
"sub z16.h, z16.h, z18.h\n"
"trn1 z2.h, z12.h, z25.h\n"
- "ld1sb { z12.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "ld1sb { z25.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z10.h, z24.h, z17.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "ld1sb { z24.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"trn1 z0.h, z16.h, z14.h\n"
"sub z12.h, z12.h, z18.h\n"
- "ld1sb { z17.s }, p2/Z, [x19]\n"
- "incw x19, ALL, MUL #5\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
"sub z25.h, z25.h, z18.h\n"
"sub z24.h, z24.h, z18.h\n"
- "ld1sb { z16.s }, p2/Z, [x19]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
"sub z17.h, z17.h, z18.h\n"
"sub z16.h, z16.h, z18.h\n"
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "addvl x20, x20, #-3\n"
- "st1h { z2.h }, p2, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z2.h }, p2, [x21]\n"
"mov z29.d, z28.d\n"
"mov z30.d, z28.d\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
"mov z31.d, z28.d\n"
"trn1 z2.h, z12.h, z25.h\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "addvl x20, x20, #-3\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
"trn1 z10.h, z24.h, z17.h\n"
"trn1 z0.h, z16.h, z14.h\n"
- "st1h { z2.h }, p2, [x20]\n"
- "st1h { z10.h }, p2, [x20, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x20, #2, MUL VL]\n"
- "cbz x19, 3f\n"
- "ld1w { z3.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "st1h { z2.h }, p2, [x21]\n"
+ "st1h { z10.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x20, x6, LSL #2]\n"
"3:" // Load mul: End
- "ldr x19, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
- "cbz x19, 4f\n"
- "ld1w { z1.s }, p1/Z, [x19, x7, LSL #2]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z1.s }, p1/Z, [x20, x6, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x19, x17, #0x1\n"
- "orr x22, x19, %x[ld_in_col], LSL #16\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x22, x6, x22, LSL #22\n"
- "mov x21, #0xb\n"
- "add x20, x5, x4\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
"mov x8, #0x0\n"
- "lsl x22, x22, #0x0\n"
- "sub x21, x21, x20\n"
- "madd x19, x19, x5, x16\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
"5:" // Issue prefetches
- "subs x21, x21, #0x1\n"
- ".inst 0xf8b64a7c // rprfm pldstrm, x22, [x19]\n"
- "add x19, x19, %x[ld_in_col]\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
"bgt 5b\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x19, %x[ld_in_row], #0x0\n"
- "msub x16, x5, x19, x16\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
- "mov x21, #0x4\n"
- "ldp x14, x13, [x24], #0x10\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x25], #0x10\n"
".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ldp x11, x10, [x19], #0x10\n"
+ "ldp x13, x11, [x20], #0x10\n"
".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ldp x9, x28, [x24], #0x10\n"
- "ldp x27, x26, [x19], #0x10\n"
- "cbz x20, 7f\n"
- "cmp x20, x21\n"
- "csel x19, x20, x21, LT\n"
- "sub x20, x20, x19\n"
- "sub x21, x21, x19\n"
- "cbz x20, 7f\n"
+ "ldp x10, x9, [x25], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
- "and x21, x20, #0x1\n"
+ "and x22, x21, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "add x20, x20, #0x1\n"
- "lsr x20, x20, #0x1\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
- "sub x15, x15, x20\n"
+ "sub x16, x16, x21\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
"6:" // Left padding
- "subs x20, x20, #0x1\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "subs x21, x21, #0x1\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x5, x4\n"
+ "adds XZR, x4, x3\n"
"bne 14f\n"
- "cbz x21, 12f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 11f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 10f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x20, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x19, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "9:" // Unpadded: 3 priming loads
- "add x20, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x19, SP, #9\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z11.h, z11.h, z21.h\n"
- "add z11.h, z11.h, z9.h\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z12.h, z12.h, z20.h\n"
- "add z12.h, z12.h, z9.h\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z13.h, z13.h, z19.h\n"
- "add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z14.h, z14.h, z18.h\n"
- "add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z15.h, z15.h, z17.h\n"
- "add z15.h, z15.h, z9.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "10:" // Unpadded: 2 priming loads
- "add x21, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x20, SP, #6\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
"ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"add z11.h, z11.h, z9.h\n"
"ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #12\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1b { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
"ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
@@ -407,40 +324,34 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z15.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"add z16.h, z16.h, z9.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "11:" // Unpadded: 1 priming loads
- "add x21, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "addvl x20, SP, #3\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
"ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"add z11.h, z11.h, z9.h\n"
"ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "addvl x19, SP, #12\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1b { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
"ld1b { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
@@ -454,127 +365,100 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z17.s }, p1/Z, [x21]\n"
- "trn1 z15.h, z15.h, z17.h\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"mov z16.d, z16.d\n"
- ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"add z16.h, z16.h, z9.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "12:" // Unpadded: 0 priming loads
- "cmp x17, #0x2\n"
- ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "blt 22f\n"
- "add x20, x16, %x[ld_in_row]\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "sub x17, x17, #0x2\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z21.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- "sub x15, x15, #0x1\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "lsr x19, x17, #0x1\n"
"add z11.h, z11.h, z9.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
- "cmp x19, x15\n"
- "ld1b { z13.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "csel x25, x19, x15, LT\n"
"add z12.h, z12.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "and x17, x17, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
"trn1 z15.h, z15.h, z17.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z16.d, z16.d\n"
- "add z16.h, z16.h, z9.h\n"
- "sub x15, x15, x25\n"
- "cbz x25, 21f\n"
- "13:" // Unpadded: Main loop
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "mov z16.d, z16.d\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
- "add x22, x16, %x[ld_in_row]\n"
- "addvl x21, SP, #3\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "addvl x20, SP, #9\n"
- "subs x25, x25, #0x1\n"
+ "add z16.h, z16.h, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
- "add x16, x16, %x[ld_in_col]\n"
- "add x19, x16, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
"ld1b { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
"add z11.h, z11.h, z9.h\n"
"ld1b { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "addvl x20, SP, #9\n"
"ld1b { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
"add z12.h, z12.h, z9.h\n"
"ld1b { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
+ "add x17, x17, %x[ld_in_col]\n"
"ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z13.h, z13.h, z9.h\n"
"ld1b { z14.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
"ld1b { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
"ld1b { z17.s }, p1/Z, [x22]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add x22, x22, %x[ld_in_row]\n"
"add z15.h, z15.h, z9.h\n"
".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"mov z16.d, z16.d\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
@@ -583,50 +467,166 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"add z16.h, z16.h, z9.h\n"
"ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- "ld1b { z11.s }, p1/Z, [x16]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "ld1b { z21.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "ld1b { z12.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "ld1b { z20.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z11.h, z11.h, z9.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z12.h, z12.h, z20.h\n"
- "add x28, x28, x26\n"
- "ld1b { z13.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "cmp x20, x16\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z12.h, z12.h, z9.h\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z19.h\n"
+ "add z13.h, z13.h, z9.h\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z18.h\n"
+ "add z14.h, z14.h, z9.h\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z15.h, z15.h, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z16.d, z16.d\n"
+ "add z16.h, z16.h, z9.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b22 // ld1h { z2.h, z10.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z21.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z11.h, z11.h, z9.h\n"
+ "ld1b { z12.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add z12.h, z12.h, z9.h\n"
+ "ld1b { z13.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z19.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z19.h\n"
+ "add z13.h, z13.h, z9.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z18.h\n"
+ "add z14.h, z14.h, z9.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z15.h, z15.h, z9.h\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ "add z16.h, z16.h, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z11.s }, p1/Z, [x17]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z21.h\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"add z11.h, z11.h, z9.h\n"
- "ld1b { z19.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"add z12.h, z12.h, z9.h\n"
- "ld1b { z14.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"add z13.h, z13.h, z9.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1b { z18.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z14.h, z14.h, z18.h\n"
"add z14.h, z14.h, z9.h\n"
- "ld1b { z15.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x19]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z15.h, z15.h, z17.h\n"
"add z15.h, z15.h, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x19]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
"mov z16.d, z16.d\n"
"add z16.h, z16.h, z9.h\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
@@ -634,717 +634,717 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
- "cbz x21, 19f\n"
- "cmp x21, #0x1\n"
- "sub x17, x17, x21\n"
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
"beq 18f\n"
- "cmp x21, #0x2\n"
+ "cmp x22, #0x2\n"
"beq 17f\n"
- "cmp x21, #0x3\n"
+ "cmp x22, #0x3\n"
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z19.s }, p0/Z, [x21]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x19, SP, #12\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "add x21, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x20]\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
+ "ld1b { z19.s }, p0/Z, [x21]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "add x21, x21, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x19, SP, #9\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x20, SP, #6\n"
+ "addvl x21, SP, #6\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #12\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "addvl x20, SP, #3\n"
+ "addvl x21, SP, #3\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
- "addvl x19, SP, #9\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "add x16, x16, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"19:" // Padded: 0 priming loads
- "cmp x17, #0x2\n"
+ "cmp x7, #0x2\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
- "sub x17, x17, #0x2\n"
- "sub x15, x15, #0x1\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "lsr x19, x17, #0x1\n"
- "cmp x19, x15\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
- "csel x24, x19, x15, LT\n"
- "add x16, x16, %x[ld_in_col]\n"
- "and x17, x17, #0x1\n"
- "sub x15, x15, x24\n"
- "cbz x24, 21f\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
"20:" // Padded: Main loop
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add x19, x16, %x[ld_in_row]\n"
- "addvl x21, SP, #3\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- "addvl x20, SP, #9\n"
- "subs x24, x24, #0x1\n"
+ ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"mov x12, #0x4\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"add z17.h, p0/M, z17.h, z9.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
"trn1 z15.h, z15.h, z17.h\n"
"mov z16.d, z16.d\n"
"bgt 20b\n"
"21:" // Main loop tail
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "addvl x23, SP, #6\n"
- "addvl x22, SP, #12\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xa1402b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
- "add x21, x16, %x[ld_in_row]\n"
- "addvl x20, SP, #3\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
- "addvl x19, SP, #9\n"
+ ".inst 0xa1402ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
+ "ld1b { z21.s }, p0/Z, [x22]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
- "ld1b { z12.s }, p0/Z, [x21]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"add z20.h, p0/M, z20.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"trn1 z11.h, z11.h, z21.h\n"
- "ld1b { z13.s }, p0/Z, [x21]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x21]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x21, x21, %x[ld_in_row]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
"trn1 z12.h, z12.h, z20.h\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z15.h, z15.h, z17.h\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add z16.h, p0/M, z16.h, z9.h\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
- "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
- "st1b { z6.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"ld1h { z0.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
+ "st1b { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
"22:" // Main loop skip tail
- "cbz x17, 23f\n" // Skip remainder inputs
+ "cbz x7, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z11.s }, p0/Z, [x16]\n"
+ "ld1b { z11.s }, p0/Z, [x17]\n"
"add z11.h, p0/M, z11.h, z9.h\n"
- "add x19, x16, %x[ld_in_row]\n"
+ "add x20, x17, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x19]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
"add z21.h, p0/M, z21.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z12.s }, p0/Z, [x19]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add z12.h, p0/M, z12.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x19]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
"add z20.h, p0/M, z20.h, z9.h\n"
"mov x12, #0x4\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
"trn1 z11.h, z11.h, z21.h\n"
"trn1 z12.h, z12.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z13.s }, p0/Z, [x19]\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add z13.h, p0/M, z13.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x19]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
"add z19.h, p0/M, z19.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z14.s }, p0/Z, [x19]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add z14.h, p0/M, z14.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x19]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
"add z18.h, p0/M, z18.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x19]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add z15.h, p0/M, z15.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x19]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add z17.h, p0/M, z17.h, z9.h\n"
- "add x19, x19, %x[ld_in_row]\n"
+ "add x20, x20, %x[ld_in_row]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"trn1 z13.h, z13.h, z19.h\n"
"trn1 z14.h, z14.h, z18.h\n"
- "ld1b { z16.s }, p0/Z, [x19]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add z16.h, p0/M, z16.h, z9.h\n"
"trn1 z15.h, z15.h, z17.h\n"
- "addvl x20, SP, #6\n"
+ "addvl x21, SP, #6\n"
".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"mov z16.d, z16.d\n"
- "addvl x19, SP, #12\n"
- "sub x15, x15, #0x1\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
- ".inst 0xa1402a62 // ld1h { z2.h, z10.h }, pn10.b/Z, [x19]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
".inst 0xc172156a // sdot za.s[x8, 2], { z11.h-z14.h }, z2.h\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc17a158a // sdot za.s[x8, 2], { z12.h-z15.h }, z10.h\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x19, #2, MUL VL]\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"add x8, x8, #0x1\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"23:" // Tail input: End
- "cbz x15, 25f\n"
+ "cbz x16, 25f\n"
"24:" // Right padding loop
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
".inst 0xc1a3ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n"
"add x8, x8, #0x1\n"
".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
- "subs x15, x15, #0x1\n"
+ "subs x16, x16, #0x1\n"
".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
".inst 0xc1b7cf44 // sclamp { z4.s-z7.s }, z26.s, z23.s\n"
- "st1b { z4.s }, p1, [x14]\n"
+ "st1b { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z5.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z5.s }, p1, [x13]\n"
- "add x13, x13, x10\n"
- "st1b { z6.s }, p1, [x9]\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z7.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z7.s }, p1, [x28]\n"
- "add x28, x28, x26\n"
"bgt 24b\n"
"25:" // End
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x21, ALL, MUL #16\n"
- "incw x21, ALL, MUL #9\n"
- "str x21, [%x[args], %[offsetof_Args_weights]]\n"
- "ldr x19, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x7\n"
- "whilelt p1.s, x7, x6\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "add x16, x16, x19\n"
- "str x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "ldr x24, [%x[args], %[offsetof_Args_outptrs]]\n"
- "ldr x23, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
- "ldp x22, x21, [x24, #0x0]\n"
- "ldp x20, x19, [x23, #0x0]\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x22, ALL, MUL #16\n"
+ "incw x22, ALL, MUL #9\n"
+ "str x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x17, x17, x20\n"
+ "str x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x0]\n"
- "ldp x22, x21, [x24, #0x10]\n"
- "ldp x20, x19, [x23, #0x10]\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
"add x22, x22, x20\n"
- "add x21, x21, x19\n"
- "stp x22, x21, [x24, #0x10]\n"
+ "stp x23, x22, [x25, #0x10]\n"
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 955a02de57..9fd220abf8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,225 +88,225 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x17, #0x0\n"
- "mov x16, #0x0\n"
+ "mov x10, #0x0\n"
+ "mov x14, #0x0\n"
"1:" // Tile loop
- "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x23, #0x2\n"
- "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x15, #0x2\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x13, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cnth x12\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x21, XZR, x12\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x17, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x16, x11, x19\n" // offset += tile_j * ld_input_col
- "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
- "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x10, x10, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x27, x10, x22, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x26, x27, x22, LSL #1\n"
- "ld1h { z16.h }, p3/Z, [x14]\n"
- "add x25, x26, x22, LSL #1\n"
- "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n"
- "add x24, x11, x11\n"
- "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n"
- "add x23, x24, x11\n"
- "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n"
- "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
- "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n"
- "madd x19, x16, x9, x19\n" // offset += tile_j * ld_output_col
- "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n"
- "mul x19, x19, x15\n" // offset *= output_tile_size
- "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n"
- "add x28, x28, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n"
- "add x22, x28, x20, LSL #1\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+ "cnth x11\n"
+ "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z9.h }, p2/Z, [x27, x11, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x10]\n"
- "addvl x14, x14, #16\n"
- "ld1h { z11.h }, p2/Z, [x10, x23, LSL #1]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n"
- "addvl x14, x14, #-6\n"
- "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x26, x11, LSL #1]\n"
+ "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
+ "add x27, x13, x13\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "add x26, x9, x23, LSL #1\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "add x25, x26, x23, LSL #1\n"
+ "add x24, x27, x13\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x11, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "add x22, x28, x22, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "sub x20, XZR, x11\n"
+ "ld1h { z10.h }, p2/Z, [x9]\n"
+ "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "addvl x10, x10, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z16\n fmla z31.h, p3/M, z4.h, z9.h\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x11, %x[n_channels]\n"
"inch x21\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x23]\n"
+ "inch x11\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
"mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "inch x13\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n"
- "inch x12\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "ld1h { z16.h }, p3/Z, [x14]\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "inch x20\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26]\n"
"fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z12.h\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n"
- "addvl x10, x10, #1\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n"
- "addvl x27, x27, #1\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26]\n"
- "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z8.h, z10.h\n"
- "fmla z30.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x24, LSL #1]\n"
"addvl x26, x26, #1\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "ld1h { z13.h }, p1/Z, [x26, x11, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z10.h\n"
- "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n"
- "whilelt p2.h, x13, %x[n_channels]\n"
- "fmla z29.h, p3/M, z7.h, z11.h\n"
- "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x25]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
"addvl x25, x25, #1\n"
- "fmla z31.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x27, x11, LSL #1]\n"
- "cmp x12, %x[n_channels]\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p1/Z, [x10]\n"
- "fmla z28.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x10, x23, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "addvl x14, x14, #16\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x27, x24, LSL #1]\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n"
- "addvl x14, x14, #-6\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x28]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "cmp x11, %x[n_channels]\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "ld1h { z10.h }, p1/Z, [x9]\n"
+ "ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
+ "st1h { z28.h }, p0, [x28]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z29.h }, p0, [x28, x12, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "st1h { z30.h }, p0, [x22]\n"
"addvl x28, x28, #1\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z29.h }, p0, [x22]\n"
- "st1h { z28.h }, p0, [x22, x9, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z31.h }, p0, [x22, x12, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z16\n fmla z31.h, p3/M, z4.h, z9.h\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z3.h, z9.h\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x17, #0x1\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x23]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x20\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "add x21, x10, #0x1\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x9, x27, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x25]\n"
- "add x16, x16, #0x1\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "cmp x16, x19\n"
- "fmla z31.h, p3/M, z5.h, z12.h\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "csel x16, x16, XZR, LT\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "csel x10, x10, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26]\n"
+ "csel x14, x14, XZR, LT\n"
"fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n"
- "csel x17, x17, x21, LT\n"
- "fmla z29.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n"
- "cmp x17, x20\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z8.h, z10.h\n"
- "fmla z30.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z9.h\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x28]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z29.h }, p0, [x22]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z28.h }, p0, [x22, x9, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "cmp x10, x20\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x25]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "st1h { z28.h }, p0, [x28]\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z29.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x12, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 7cca6fbcbf..9242b470c3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,215 +78,215 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cnth x12\n"
- "ldp x11, x10, [x19, #0x0]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x19, #0x10]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z16.h }, p3/Z, [x15]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "ldr x22, [x14, #0x20]\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x16]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z16\n fmla z31.h, p3/M, z4.h, z9.h\n"
- "ldr x21, [x14, #0x28]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z3.h, z9.h\n"
- "ldr x20, [x14, #0x30]\n"
- "inch x9\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "ldr x19, [x14, #0x38]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x21, x13, LSL #1]\n"
- "ldr x26, [x14, #0x40]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ldr x25, [x14, #0x48]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "ldr x24, [x14, #0x50]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z12.h\n"
- "ldr x23, [x14, #0x58]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
"fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z16.h }, p3/Z, [x15]\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ldr x22, [x14, #0x20]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z8.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z30.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "ld1h { z13.h }, p1/Z, [x22, x12, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z10.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "inch x13\n"
- "fmla z29.h, p3/M, z7.h, z11.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "whilelt p2.h, x13, %x[n_channels]\n"
- "fmla z31.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x26, x12, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x24, x12, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p1/Z, [x25, x12, LSL #1]\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "addvl x15, x15, #16\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "ld1h { z12.h }, p1/Z, [x23, x12, LSL #1]\n"
- "inch x12\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x10, x9, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
- "st1h { z28.h }, p0, [x27, x9, LSL #1]\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "inch x28\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z18.h }, p3/Z, [x16]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "inch x9\n"
+ "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "inch x14\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "whilelt p2.h, x9, %x[n_channels]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z16\n fmla z31.h, p3/M, z4.h, z9.h\n"
- "ldr x21, [x14, #0x28]\n"
- "inch x9\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z3.h, z9.h\n"
- "ldr x20, [x14, #0x30]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "ldr x19, [x14, #0x38]\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x21, x13, LSL #1]\n"
- "ldr x26, [x14, #0x40]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ldr x25, [x14, #0x48]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldr x24, [x14, #0x50]\n"
- "fmla z31.h, p3/M, z5.h, z12.h\n"
- "ldr x23, [x14, #0x58]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z9.h\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ldr x19, [x14, #0x78]\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
"fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z8.h, z10.h\n"
- "fmla z30.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z9.h\n"
- "fmla z28.h, p3/M, z6.h, z11.h\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x10, x9, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z28.h }, p0, [x27, x9, LSL #1]\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 4126cefa34..d2dae84089 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,369 +88,369 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x6, #0x0\n"
- "mov x7, #0x0\n"
+ "mov x13, #0x0\n"
+ "mov x8, #0x0\n"
"1:" // Tile loop
- "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x3\n"
"mov x24, #0x3\n"
- "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x23, #0x3\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x17, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cnth x16\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x21, XZR, x16\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x15\n"
+ "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
"ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x6, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x7, x15, x19\n" // offset += tile_j * ld_input_col
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
- "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x14, x14, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x11, x14, x22, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x10, x11, x22, LSL #1\n"
- "ld1h { z16.h }, p3/Z, [x8]\n"
- "add x9, x10, x22, LSL #1\n"
- "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
- "add x28, x9, x22, LSL #1\n"
- "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
- "add x27, x15, x15\n"
- "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
- "add x26, x27, x15\n"
- "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
- "add x25, x26, x15\n"
- "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
- "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
- "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
- "madd x19, x7, x13, x19\n" // offset += tile_j * ld_output_col
- "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
- "mul x19, x19, x23\n" // offset *= output_tile_size
- "add x24, x13, x13\n"
- "add x12, x12, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "add x23, x12, x20, LSL #1\n"
- "add x22, x23, x20, LSL #1\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x12, x17, x17\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x14, x23, LSL #1\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x10, x23, LSL #1\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z9.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x13]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+ "add x28, x9, x23, LSL #1\n"
+ "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+ "add x27, x12, x17\n"
+ "add x11, x11, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+ "add x26, x28, x23, LSL #1\n"
+ "add x25, x27, x17\n"
+ "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "add x24, x11, x21, LSL #1\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "add x23, x24, x21, LSL #1\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+ "add x22, x16, x16\n"
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x9, x12, LSL #1]\n"
+ "sub x20, XZR, x15\n"
"ld1h { z10.h }, p2/Z, [x14]\n"
- "addvl x8, x8, #16\n"
"ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n"
- "cmp x16, %x[n_channels]\n"
- "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
- "addvl x8, x8, #-6\n"
- "ld1h { z12.h }, p2/Z, [x28]\n"
- "ld1h { z13.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "addvl x13, x13, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x26]\n"
+ "ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z16\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x16, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z7.h, z9.h\n"
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x15, %x[n_channels]\n"
"inch x21\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "inch x15\n"
"mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z5.h, z9.h\n"
- "inch x17\n"
- "movprfx z27, z16\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "inch x16\n"
- "movprfx z26, z16\n fmla z26.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z16\n fmla z25.h, p3/M, z2.h, z9.h\n"
- "movprfx z24, z16\n fmla z24.h, p3/M, z1.h, z9.h\n"
- "movprfx z23, z16\n fmla z23.h, p3/M, z0.h, z9.h\n"
- "ld1h { z16.h }, p3/Z, [x8]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "inch x20\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
"fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z26.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z18.h }, p3/Z, [x13]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11]\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28]\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
"fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9]\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z24.h, p3/M, z2.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z13.h\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n"
- "fmla z25.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
"fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmla z26.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z24.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n"
- "fmla z23.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n"
- "addvl x11, x11, #1\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
"fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z5.h, z11.h\n"
- "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "addvl x10, x10, #1\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
"addvl x14, x14, #1\n"
- "fmla z24.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
"ld1h { z10.h }, p1/Z, [x14]\n"
- "fmla z23.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
- "addvl x9, x9, #1\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
"fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10]\n"
+ "ld1h { z12.h }, p2/Z, [x9]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmax z24.h, p3/M, z24.h, z17.h\n"
+ "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "addvl x9, x9, #1\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
"fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n"
- "addvl x10, x10, #1\n"
"fmla z27.h, p3/M, z8.h, z13.h\n"
- "ld1h { z9.h }, p1/Z, [x10, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z5.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
- "whilelt p2.h, x17, %x[n_channels]\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
- "addvl x28, x28, #1\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmla z25.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x28]\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z17.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmax z26.h, p3/M, z26.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z25.h, p3/M, z25.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "addvl x26, x26, #1\n"
+ "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
"ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
- "fmla z24.h, p3/M, z7.h, z13.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z25.h, p3/M, z8.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmla z23.h, p3/M, z6.h, z13.h\n"
- "ld1h { z13.h }, p1/Z, [x11, x27, LSL #1]\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
- "addvl x8, x8, #16\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
- "addvl x8, x8, #-6\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "st1h { z31.h }, p0, [x12]\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z30.h }, p0, [x12, x13, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z18.h\n"
- "fmax z26.h, p3/M, z26.h, z18.h\n"
- "st1h { z29.h }, p0, [x12, x24, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "addvl x12, x12, #1\n"
- "fmax z25.h, p3/M, z25.h, z18.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "fmin z27.h, p3/M, z27.h, z17.h\n"
- "fmin z26.h, p3/M, z26.h, z17.h\n"
- "st1h { z27.h }, p0, [x23, x13, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z17.h\n"
- "fmax z24.h, p3/M, z24.h, z18.h\n"
- "st1h { z26.h }, p0, [x23, x24, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x26]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
+ "st1h { z23.h }, p0, [x11]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z24.h }, p0, [x11, x16, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z25.h }, p0, [x11, x22, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+ "st1h { z26.h }, p0, [x24]\n"
+ "addvl x13, x13, #-6\n"
+ "st1h { z27.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z28.h }, p0, [x24, x22, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z29.h }, p0, [x23]\n"
+ "st1h { z30.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
"addvl x23, x23, #1\n"
- "fmax z23.h, p3/M, z23.h, z18.h\n"
- "st1h { z25.h }, p0, [x22]\n"
- "fmin z24.h, p3/M, z24.h, z17.h\n"
- "fmin z23.h, p3/M, z23.h, z17.h\n"
- "st1h { z24.h }, p0, [x22, x13, LSL #1]\n"
- "st1h { z23.h }, p0, [x22, x24, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z16\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z7.h, z9.h\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x6, #0x1\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "cmp x8, x20\n"
+ "add x21, x13, #0x1\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z5.h, z9.h\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x7, x7, #0x1\n"
- "movprfx z27, z16\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "cmp x7, x19\n"
- "movprfx z26, z16\n fmla z26.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z16\n fmla z25.h, p3/M, z2.h, z9.h\n"
- "csel x7, x7, XZR, LT\n"
- "movprfx z24, z16\n fmla z24.h, p3/M, z1.h, z9.h\n"
- "csel x6, x6, x21, LT\n"
- "movprfx z23, z16\n fmla z23.h, p3/M, z0.h, z9.h\n"
- "cmp x6, x20\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "csel x13, x13, x21, LT\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "csel x8, x8, XZR, LT\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "cmp x13, x20\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
"fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z26.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x25, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11]\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28]\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x25, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
"fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9]\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z24.h, p3/M, z2.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z13.h\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n"
- "fmla z25.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x25, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
"fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmla z26.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z24.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n"
- "fmla z23.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
"fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z5.h, z11.h\n"
- "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z13.h\n"
- "fmla z23.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
"fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10]\n"
+ "ld1h { z12.h }, p2/Z, [x9]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmax z24.h, p3/M, z24.h, z17.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
"fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n"
"fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z5.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z12.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z25.h, p3/M, z8.h, z13.h\n"
- "fmla z24.h, p3/M, z7.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmla z23.h, p3/M, z6.h, z13.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x12]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x12, x13, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmax z27.h, p3/M, z27.h, z18.h\n"
- "st1h { z29.h }, p0, [x12, x24, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z18.h\n"
- "fmax z25.h, p3/M, z25.h, z18.h\n"
- "fmax z24.h, p3/M, z24.h, z18.h\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "fmin z27.h, p3/M, z27.h, z17.h\n"
- "fmin z26.h, p3/M, z26.h, z17.h\n"
- "st1h { z27.h }, p0, [x23, x13, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z17.h\n"
- "fmin z24.h, p3/M, z24.h, z17.h\n"
- "st1h { z26.h }, p0, [x23, x24, LSL #1]\n"
- "fmax z23.h, p3/M, z23.h, z18.h\n"
- "st1h { z25.h }, p0, [x22]\n"
- "fmin z23.h, p3/M, z23.h, z17.h\n"
- "st1h { z24.h }, p0, [x22, x13, LSL #1]\n"
- "st1h { z23.h }, p0, [x22, x24, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z17.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmax z26.h, p3/M, z26.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z25.h, p3/M, z25.h, z17.h\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z23.h }, p0, [x11]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z24.h }, p0, [x11, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z25.h }, p0, [x11, x22, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z26.h }, p0, [x24]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z27.h }, p0, [x24, x16, LSL #1]\n"
+ "st1h { z28.h }, p0, [x24, x22, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23]\n"
+ "st1h { z30.h }, p0, [x23, x16, LSL #1]\n"
+ "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index f79a36b2a3..59c0e0cf0b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,387 +87,387 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cnth x12\n"
- "ld1h { z16.h }, p3/Z, [x15]\n"
- "sub x11, XZR, x12\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z18.h }, p3/Z, [x17]\n"
+ "cnth x15\n"
+ "mov x14, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x10, x9, [x14, #0x0]\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1h { z9.h }, p2/Z, [x10, x13, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "ldr x26, [x14, #0x20]\n"
- "ld1h { z11.h }, p2/Z, [x28, x13, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "sub x13, XZR, x15\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "ldr x28, [x16, #0x20]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x28, x14, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z16\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x25, [x14, #0x28]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z7.h, z9.h\n"
- "ldr x24, [x14, #0x30]\n"
- "inch x11\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x23, [x14, #0x38]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z5.h, z9.h\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z27, z16\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z26, z16\n fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z25, z16\n fmla z25.h, p3/M, z2.h, z9.h\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z24, z16\n fmla z24.h, p3/M, z1.h, z9.h\n"
- "ldr x26, [x14, #0x60]\n"
- "movprfx z23, z16\n fmla z23.h, p3/M, z0.h, z9.h\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "ldr x21, [x16, #0x8]\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "ldr x25, [x16, #0x38]\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ldr x28, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "inch x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
"fmla z27.h, p3/M, z1.h, z13.h\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z26.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z30.h, p3/M, z6.h, z11.h\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
- "ldr x19, [x16, #0x18]\n"
+ "ldr x23, [x27, #0x0]\n"
+ "whilelt p0.h, x15, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z3.h, z11.h\n"
- "ld1h { z16.h }, p3/Z, [x15]\n"
- "fmla z25.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ldr x27, [x14, #0x98]\n"
+ "ldr x22, [x27, #0x8]\n"
+ "ldr x21, [x27, #0x10]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x20, [x27, #0x18]\n"
+ "ld1h { z18.h }, p3/Z, [x17]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ldr x10, [x16, #0x90]\n"
"fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z24.h, p3/M, z2.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z13.h\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z25.h, p3/M, z3.h, z12.h\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x13, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x28, [x16, #0xa0]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
"fmla z27.h, p3/M, z7.h, z10.h\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z26.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z24.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z23.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
"fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z5.h, z11.h\n"
- "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z13.h\n"
- "ldr x26, [x14, #0x20]\n"
- "fmla z23.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ldr x28, [x16, #0x20]\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
"fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmax z24.h, p3/M, z24.h, z17.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z23.h }, p1, [x23, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
"fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x27, #0x20]\n"
+ "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
"fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z5.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x13, LSL #1]\n"
- "inch x13\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ldp x10, x9, [x14, #0x0]\n"
- "whilelt p2.h, x13, %x[n_channels]\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "fmla z25.h, p3/M, z0.h, z12.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "ld1h { z9.h }, p1/Z, [x10, x12, LSL #1]\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "ld1h { z10.h }, p1/Z, [x9, x12, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x28, x12, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z13.h\n"
- "ld1h { z12.h }, p1/Z, [x27, x12, LSL #1]\n"
- "fmla z24.h, p3/M, z7.h, z13.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z23.h, p3/M, z6.h, z13.h\n"
- "ld1h { z13.h }, p1/Z, [x26, x12, LSL #1]\n"
- "inch x12\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "fmax z27.h, p3/M, z27.h, z18.h\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "st1h { z31.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "ldr x22, [x16, #0x20]\n"
- "fmax z26.h, p3/M, z26.h, z18.h\n"
- "st1h { z30.h }, p0, [x21, x11, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "fmin z27.h, p3/M, z27.h, z17.h\n"
- "st1h { z29.h }, p0, [x20, x11, LSL #1]\n"
- "fmin z26.h, p3/M, z26.h, z17.h\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z25.h, p3/M, z25.h, z18.h\n"
- "ldr x20, [x16, #0x30]\n"
- "fmax z24.h, p3/M, z24.h, z18.h\n"
- "st1h { z28.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z23.h, p3/M, z23.h, z18.h\n"
- "st1h { z27.h }, p0, [x22, x11, LSL #1]\n"
- "st1h { z26.h }, p0, [x21, x11, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z17.h\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z24.h, p3/M, z24.h, z17.h\n"
- "ldr x22, [x16, #0x40]\n"
- "fmin z23.h, p3/M, z23.h, z17.h\n"
- "st1h { z25.h }, p0, [x20, x11, LSL #1]\n"
- "st1h { z24.h }, p0, [x19, x11, LSL #1]\n"
- "st1h { z23.h }, p0, [x22, x11, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "fmax z25.h, p3/M, z25.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmax z26.h, p3/M, z26.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z17.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "inch x14\n"
+ "ld1h { z9.h }, p0/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z11.h }, p0/Z, [x10, x15, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x9, x15, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "ld1h { z13.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "st1h { z25.h }, p1, [x21, x13, LSL #1]\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x22, [x27, #0x28]\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z27.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x21, [x27, #0x30]\n"
+ "ldr x20, [x27, #0x38]\n"
+ "ldr x23, [x27, #0x40]\n"
+ "whilelt p2.h, x14, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "st1h { z28.h }, p1, [x22, x13, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "st1h { z29.h }, p1, [x21, x13, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "st1h { z30.h }, p1, [x20, x13, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "st1h { z31.h }, p1, [x23, x13, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z16\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x25, [x14, #0x28]\n"
- "inch x11\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z7.h, z9.h\n"
- "ldr x24, [x14, #0x30]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x23, [x14, #0x38]\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z5.h, z9.h\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z27, z16\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z26, z16\n fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z25, z16\n fmla z25.h, p3/M, z2.h, z9.h\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z24, z16\n fmla z24.h, p3/M, z1.h, z9.h\n"
- "ldr x26, [x14, #0x60]\n"
- "movprfx z23, z16\n fmla z23.h, p3/M, z0.h, z9.h\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "ldr x21, [x16, #0x8]\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "ldr x25, [x16, #0x38]\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ldr x28, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "inch x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
"fmla z27.h, p3/M, z1.h, z13.h\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z26.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z30.h, p3/M, z6.h, z11.h\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
- "ldr x19, [x16, #0x18]\n"
+ "ldr x23, [x27, #0x0]\n"
+ "ldr x22, [x27, #0x8]\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ldr x27, [x14, #0x98]\n"
+ "ldr x21, [x27, #0x10]\n"
+ "ldr x20, [x27, #0x18]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ldr x10, [x16, #0x90]\n"
"fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z24.h, p3/M, z2.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z13.h\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z25.h, p3/M, z3.h, z12.h\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x13, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x28, [x16, #0xa0]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
"fmla z27.h, p3/M, z7.h, z10.h\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z26.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z24.h, p3/M, z4.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z23.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
"fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z5.h, z11.h\n"
- "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
"fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z13.h\n"
- "fmla z23.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
"fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmax z24.h, p3/M, z24.h, z17.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z23.h }, p1, [x23, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
"fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x27, #0x20]\n"
+ "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
"fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z13.h\n"
- "fmla z24.h, p3/M, z5.h, z13.h\n"
- "fmla z23.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z12.h\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z25.h, p3/M, z8.h, z13.h\n"
- "fmla z24.h, p3/M, z7.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmla z23.h, p3/M, z6.h, z13.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "ldr x22, [x16, #0x20]\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z30.h }, p0, [x21, x11, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z18.h\n"
- "fmax z26.h, p3/M, z26.h, z18.h\n"
- "st1h { z29.h }, p0, [x20, x11, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z25.h, p3/M, z25.h, z18.h\n"
- "ldr x20, [x16, #0x30]\n"
- "fmax z24.h, p3/M, z24.h, z18.h\n"
- "st1h { z28.h }, p0, [x19, x11, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z17.h\n"
- "fmin z26.h, p3/M, z26.h, z17.h\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z25.h, p3/M, z25.h, z17.h\n"
- "st1h { z27.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z17.h\n"
- "fmax z23.h, p3/M, z23.h, z18.h\n"
- "st1h { z26.h }, p0, [x21, x11, LSL #1]\n"
- "st1h { z25.h }, p0, [x20, x11, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z17.h\n"
- "st1h { z24.h }, p0, [x19, x11, LSL #1]\n"
- "ldr x22, [x16, #0x40]\n"
- "st1h { z23.h }, p0, [x22, x11, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z25.h, p3/M, z25.h, z17.h\n"
+ "fmax z27.h, p3/M, z27.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z25.h }, p1, [x21, x13, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z17.h\n"
+ "fmax z30.h, p3/M, z30.h, z17.h\n"
+ "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x22, [x27, #0x28]\n"
+ "fmax z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z27.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x21, [x27, #0x30]\n"
+ "ldr x20, [x27, #0x38]\n"
+ "ldr x23, [x27, #0x40]\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "st1h { z28.h }, p1, [x22, x13, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z29.h }, p1, [x21, x13, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x13, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x13, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 3cfac06449..c0b9137f6b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,565 +88,565 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x2, #0x0\n"
- "mov x3, #0x0\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
"1:" // Tile loop
- "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
"mov x24, #0x4\n"
- "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x23, #0x4\n"
- "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x5, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cnth x6\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x21, XZR, x6\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
"ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
- "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x15, x8, x22, LSL #1\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x14, x15, x22, LSL #1\n"
- "ld1h { z13.h }, p3/Z, [x4]\n"
- "add x13, x14, x22, LSL #1\n"
- "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n"
- "add x12, x13, x22, LSL #1\n"
- "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n"
- "add x11, x12, x22, LSL #1\n"
- "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n"
- "add x10, x7, x7\n"
- "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n"
- "add x9, x10, x7\n"
- "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n"
- "add x28, x9, x7\n"
- "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n"
- "add x27, x28, x7\n"
- "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n"
- "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
- "add x26, x17, x17\n"
- "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x23\n" // offset *= output_tile_size
- "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "add x25, x16, x20, LSL #1\n"
- "add x24, x25, x20, LSL #1\n"
- "add x23, x24, x20, LSL #1\n"
- "add x22, x26, x17\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cnth x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z9.h }, p2/Z, [x14, x10, LSL #1]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x8, x23, LSL #1\n"
+ "ld1h { z15.h }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #1\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #1\n"
+ "add x10, x14, x5\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #1\n"
+ "add x28, x11, x23, LSL #1\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #1\n"
+ "add x25, x6, x6\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #1\n"
+ "add x22, x25, x6\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
"ld1h { z10.h }, p2/Z, [x8]\n"
- "addvl x4, x4, #16\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
"ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n"
- "cmp x6, %x[n_channels]\n"
- "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n"
- "addvl x4, x4, #-6\n"
- "ld1h { z12.h }, p2/Z, [x14, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z13\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x6, %x[n_channels]\n"
- "movprfx z30, z13\n fmla z30.h, p3/M, z7.h, z9.h\n"
+ "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x16, %x[n_channels]\n"
"inch x21\n"
- "movprfx z29, z13\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "inch x16\n"
"mov p0.b, p2.b\n"
- "movprfx z27, z13\n fmla z27.h, p3/M, z5.h, z9.h\n"
- "inch x5\n"
- "movprfx z26, z13\n fmla z26.h, p3/M, z4.h, z9.h\n"
- "inch x6\n"
- "movprfx z25, z13\n fmla z25.h, p3/M, z3.h, z9.h\n"
- "movprfx z23, z13\n fmla z23.h, p3/M, z2.h, z9.h\n"
- "movprfx z22, z13\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "movprfx z21, z13\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11]\n"
- "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z26.h, p3/M, z5.h, z12.h\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "movprfx z24, z13\n fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z2.h, z12.h\n"
- "fmla z21.h, p3/M, z1.h, z12.h\n"
- "movprfx z20, z13\n fmla z20.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n"
- "movprfx z19, z13\n fmla z19.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n"
- "movprfx z16, z13\n fmla z16.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z9.h\n"
- "fmla z26.h, p3/M, z7.h, z9.h\n"
- "fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z23.h, p3/M, z5.h, z9.h\n"
- "fmla z22.h, p3/M, z4.h, z9.h\n"
- "fmla z21.h, p3/M, z3.h, z9.h\n"
- "fmla z19.h, p3/M, z2.h, z9.h\n"
- "movprfx z18, z13\n fmla z18.h, p3/M, z1.h, z9.h\n"
- "movprfx z17, z13\n fmla z17.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x15]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z13.h }, p3/Z, [x4]\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z10.h\n"
- "fmla z22.h, p3/M, z5.h, z10.h\n"
- "fmla z21.h, p3/M, z4.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "inch x20\n"
+ "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
"fmla z16.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "fmla z27.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z11.h\n"
- "fmla z19.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z20.h, p3/M, z8.h, z11.h\n"
- "fmla z16.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n"
- "fmla z19.h, p3/M, z7.h, z11.h\n"
- "fmla z18.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z10.h\n"
- "fmla z26.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
"fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z17.h, p3/M, z8.h, z11.h\n"
- "fmla z16.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
"fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x14, LSL #1]\n"
"addvl x8, x8, #1\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14]\n"
- "fmla z27.h, p3/M, z7.h, z11.h\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
"fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z4.h, z11.h\n"
- "fmla z22.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "fmla z18.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
- "addvl x14, x14, #1\n"
- "fmla z31.h, p3/M, z6.h, z10.h\n"
- "ld1h { z9.h }, p1/Z, [x14, x10, LSL #1]\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x5, LSL #1]\n"
"fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z24.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z5.h, z11.h\n"
- "fmla z20.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z11.h\n"
- "fmla z16.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "addvl x24, x24, #1\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x10, LSL #1]\n"
"addvl x13, x13, #1\n"
- "fmla z27.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z19.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z8.h, z11.h\n"
- "fmla z19.h, p3/M, z5.h, z11.h\n"
- "fmla z18.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z12.h\n"
- "fmla z20.h, p3/M, z5.h, z12.h\n"
- "fmla z16.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n"
- "addvl x11, x11, #1\n"
- "fmla z19.h, p3/M, z8.h, z10.h\n"
- "fmla z18.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z21.h, p3/M, z7.h, z11.h\n"
- "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmax z16.h, p3/M, z16.h, z14.h\n"
+ "fmax z17.h, p3/M, z17.h, z14.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z20.h, p3/M, z20.h, z14.h\n"
"fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z17.h, p3/M, z4.h, z11.h\n"
- "fmla z16.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n"
- "addvl x15, x15, #1\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmax z18.h, p3/M, z18.h, z14.h\n"
+ "fmax z19.h, p3/M, z19.h, z14.h\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmax z21.h, p3/M, z21.h, z14.h\n"
+ "fmax z22.h, p3/M, z22.h, z14.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmax z23.h, p3/M, z23.h, z14.h\n"
+ "fmax z24.h, p3/M, z24.h, z14.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmax z25.h, p3/M, z25.h, z14.h\n"
+ "fmax z28.h, p3/M, z28.h, z14.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmax z26.h, p3/M, z26.h, z14.h\n"
+ "fmax z27.h, p3/M, z27.h, z14.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
"fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z7.h, z12.h\n"
- "fmla z16.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z1.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n"
- "whilelt p2.h, x5, %x[n_channels]\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n"
- "addvl x12, x12, #1\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
- "cmp x6, %x[n_channels]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n"
- "fmla z22.h, p3/M, z6.h, z12.h\n"
- "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n"
- "fmla z19.h, p3/M, z4.h, z12.h\n"
- "fmla z18.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x14, x9, LSL #1]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n"
- "fmla z20.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z10.h\n"
- "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmax z29.h, p3/M, z29.h, z14.h\n"
+ "fmax z30.h, p3/M, z30.h, z14.h\n"
+ "fmax z31.h, p3/M, z31.h, z14.h\n"
+ "ld1h { z15.h }, p3/Z, [x17]\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z16.h, p3/M, z16.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "fmin z17.h, p3/M, z17.h, z13.h\n"
+ "fmin z18.h, p3/M, z18.h, z13.h\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z19.h, p3/M, z19.h, z13.h\n"
+ "fmin z20.h, p3/M, z20.h, z13.h\n"
+ "fmin z21.h, p3/M, z21.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
"ld1h { z10.h }, p1/Z, [x8]\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
- "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n"
- "addvl x4, x4, #16\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n"
- "fmax z29.h, p3/M, z29.h, z15.h\n"
- "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n"
- "addvl x4, x4, #-6\n"
- "fmin z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z31.h }, p0, [x16]\n"
- "fmin z30.h, p3/M, z30.h, z14.h\n"
- "fmin z29.h, p3/M, z29.h, z14.h\n"
- "st1h { z30.h }, p0, [x16, x17, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z15.h\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
- "st1h { z29.h }, p0, [x16, x26, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z15.h\n"
- "fmax z25.h, p3/M, z25.h, z15.h\n"
- "fmax z24.h, p3/M, z24.h, z15.h\n"
- "fmin z28.h, p3/M, z28.h, z14.h\n"
- "st1h { z28.h }, p0, [x16, x22, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z14.h\n"
- "addvl x16, x16, #1\n"
- "fmin z26.h, p3/M, z26.h, z14.h\n"
- "st1h { z27.h }, p0, [x25]\n"
- "fmin z25.h, p3/M, z25.h, z14.h\n"
- "fmin z24.h, p3/M, z24.h, z14.h\n"
- "st1h { z26.h }, p0, [x25, x17, LSL #1]\n"
- "fmax z23.h, p3/M, z23.h, z15.h\n"
- "st1h { z25.h }, p0, [x25, x26, LSL #1]\n"
- "fmax z22.h, p3/M, z22.h, z15.h\n"
- "fmax z21.h, p3/M, z21.h, z15.h\n"
- "st1h { z24.h }, p0, [x25, x22, LSL #1]\n"
- "addvl x25, x25, #1\n"
- "fmin z23.h, p3/M, z23.h, z14.h\n"
- "st1h { z23.h }, p0, [x24]\n"
- "fmin z22.h, p3/M, z22.h, z14.h\n"
- "fmin z21.h, p3/M, z21.h, z14.h\n"
- "st1h { z22.h }, p0, [x24, x17, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z15.h\n"
- "fmax z19.h, p3/M, z19.h, z15.h\n"
- "st1h { z21.h }, p0, [x24, x26, LSL #1]\n"
- "fmax z18.h, p3/M, z18.h, z15.h\n"
- "fmax z17.h, p3/M, z17.h, z15.h\n"
- "fmax z16.h, p3/M, z16.h, z15.h\n"
- "fmin z20.h, p3/M, z20.h, z14.h\n"
- "st1h { z20.h }, p0, [x24, x22, LSL #1]\n"
- "fmin z19.h, p3/M, z19.h, z14.h\n"
- "addvl x24, x24, #1\n"
- "fmin z18.h, p3/M, z18.h, z14.h\n"
- "st1h { z19.h }, p0, [x23]\n"
- "fmin z17.h, p3/M, z17.h, z14.h\n"
- "fmin z16.h, p3/M, z16.h, z14.h\n"
- "st1h { z18.h }, p0, [x23, x17, LSL #1]\n"
- "st1h { z17.h }, p0, [x23, x26, LSL #1]\n"
- "st1h { z16.h }, p0, [x23, x22, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z13.h\n"
+ "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z13.h\n"
+ "fmin z25.h, p3/M, z25.h, z13.h\n"
+ "st1h { z16.h }, p0, [x15]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z26.h, p3/M, z26.h, z13.h\n"
+ "fmin z27.h, p3/M, z27.h, z13.h\n"
+ "st1h { z17.h }, p0, [x15, x6, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z28.h, p3/M, z28.h, z13.h\n"
+ "fmin z29.h, p3/M, z29.h, z13.h\n"
+ "st1h { z18.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z13.h\n"
+ "fmin z31.h, p3/M, z31.h, z13.h\n"
+ "st1h { z19.h }, p0, [x15, x22, LSL #1]\n"
+ "addvl x28, x28, #1\n"
+ "st1h { z20.h }, p0, [x9]\n"
+ "addvl x15, x15, #1\n"
+ "st1h { z21.h }, p0, [x9, x6, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "st1h { z22.h }, p0, [x9, x25, LSL #1]\n"
+ "st1h { z23.h }, p0, [x9, x22, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z24.h }, p0, [x26]\n"
+ "st1h { z25.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z26.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z27.h }, p0, [x26, x22, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z30.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z13\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z13\n fmla z30.h, p3/M, z7.h, z9.h\n"
- "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x2, #0x1\n"
- "movprfx z29, z13\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z27, z13\n fmla z27.h, p3/M, z5.h, z9.h\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x3, x3, #0x1\n"
- "movprfx z26, z13\n fmla z26.h, p3/M, z4.h, z9.h\n"
- "cmp x3, x19\n"
- "movprfx z25, z13\n fmla z25.h, p3/M, z3.h, z9.h\n"
- "movprfx z23, z13\n fmla z23.h, p3/M, z2.h, z9.h\n"
- "csel x3, x3, XZR, LT\n"
- "movprfx z22, z13\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "csel x2, x2, x21, LT\n"
- "movprfx z21, z13\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n"
- "cmp x2, x20\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11]\n"
- "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z26.h, p3/M, z5.h, z12.h\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "movprfx z24, z13\n fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z2.h, z12.h\n"
- "fmla z21.h, p3/M, z1.h, z12.h\n"
- "movprfx z20, z13\n fmla z20.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n"
- "movprfx z19, z13\n fmla z19.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n"
- "movprfx z16, z13\n fmla z16.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z9.h\n"
- "fmla z26.h, p3/M, z7.h, z9.h\n"
- "fmla z25.h, p3/M, z6.h, z9.h\n"
- "fmla z23.h, p3/M, z5.h, z9.h\n"
- "fmla z22.h, p3/M, z4.h, z9.h\n"
- "fmla z21.h, p3/M, z3.h, z9.h\n"
- "fmla z19.h, p3/M, z2.h, z9.h\n"
- "movprfx z18, z13\n fmla z18.h, p3/M, z1.h, z9.h\n"
- "movprfx z17, z13\n fmla z17.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x15]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z10.h\n"
- "fmla z22.h, p3/M, z5.h, z10.h\n"
- "fmla z21.h, p3/M, z4.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "mov p0.b, p2.b\n"
"fmla z16.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "fmla z27.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z11.h\n"
- "fmla z19.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
"fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z20.h, p3/M, z8.h, z11.h\n"
- "fmla z16.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
"fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n"
- "fmla z19.h, p3/M, z7.h, z11.h\n"
- "fmla z18.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
"fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z10.h\n"
- "fmla z26.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z17.h, p3/M, z8.h, z11.h\n"
- "fmla z16.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14]\n"
- "fmla z27.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
"fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z4.h, z11.h\n"
- "fmla z22.h, p3/M, z3.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "fmla z18.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x5, LSL #1]\n"
"fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z24.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z5.h, z11.h\n"
- "fmla z20.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z11.h\n"
- "fmla z16.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z12.h\n"
- "fmla z24.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z27.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z19.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z8.h, z11.h\n"
- "fmla z19.h, p3/M, z5.h, z11.h\n"
- "fmla z18.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z12.h\n"
- "fmla z20.h, p3/M, z5.h, z12.h\n"
- "fmla z16.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n"
- "fmla z19.h, p3/M, z8.h, z10.h\n"
- "fmla z18.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z21.h, p3/M, z7.h, z11.h\n"
- "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmax z16.h, p3/M, z16.h, z14.h\n"
+ "fmax z17.h, p3/M, z17.h, z14.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z20.h, p3/M, z20.h, z14.h\n"
"fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z17.h, p3/M, z4.h, z11.h\n"
- "fmla z16.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmax z18.h, p3/M, z18.h, z14.h\n"
+ "fmax z19.h, p3/M, z19.h, z14.h\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmax z21.h, p3/M, z21.h, z14.h\n"
+ "fmax z22.h, p3/M, z22.h, z14.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmax z23.h, p3/M, z23.h, z14.h\n"
+ "fmax z24.h, p3/M, z24.h, z14.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmax z25.h, p3/M, z25.h, z14.h\n"
+ "fmax z28.h, p3/M, z28.h, z14.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmax z26.h, p3/M, z26.h, z14.h\n"
+ "fmax z27.h, p3/M, z27.h, z14.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
"fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z17.h, p3/M, z7.h, z12.h\n"
- "fmla z16.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z1.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z6.h, z12.h\n"
- "fmla z19.h, p3/M, z4.h, z12.h\n"
- "fmla z18.h, p3/M, z3.h, z12.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z10.h\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "fmax z29.h, p3/M, z29.h, z15.h\n"
- "fmax z28.h, p3/M, z28.h, z15.h\n"
- "fmin z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z31.h }, p0, [x16]\n"
- "fmin z30.h, p3/M, z30.h, z14.h\n"
- "fmin z29.h, p3/M, z29.h, z14.h\n"
- "st1h { z30.h }, p0, [x16, x17, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z14.h\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
- "st1h { z29.h }, p0, [x16, x26, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z15.h\n"
- "st1h { z28.h }, p0, [x16, x22, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z14.h\n"
- "fmax z25.h, p3/M, z25.h, z15.h\n"
- "st1h { z27.h }, p0, [x25]\n"
- "fmin z26.h, p3/M, z26.h, z14.h\n"
- "fmin z25.h, p3/M, z25.h, z14.h\n"
- "st1h { z26.h }, p0, [x25, x17, LSL #1]\n"
- "fmax z24.h, p3/M, z24.h, z15.h\n"
- "fmax z23.h, p3/M, z23.h, z15.h\n"
- "st1h { z25.h }, p0, [x25, x26, LSL #1]\n"
- "fmax z22.h, p3/M, z22.h, z15.h\n"
- "fmax z21.h, p3/M, z21.h, z15.h\n"
- "fmax z20.h, p3/M, z20.h, z15.h\n"
- "fmin z24.h, p3/M, z24.h, z14.h\n"
- "st1h { z24.h }, p0, [x25, x22, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z14.h\n"
- "fmin z22.h, p3/M, z22.h, z14.h\n"
- "st1h { z23.h }, p0, [x24]\n"
- "fmin z21.h, p3/M, z21.h, z14.h\n"
- "fmin z20.h, p3/M, z20.h, z14.h\n"
- "st1h { z22.h }, p0, [x24, x17, LSL #1]\n"
- "fmax z19.h, p3/M, z19.h, z15.h\n"
- "st1h { z21.h }, p0, [x24, x26, LSL #1]\n"
- "fmax z18.h, p3/M, z18.h, z15.h\n"
- "fmax z17.h, p3/M, z17.h, z15.h\n"
- "st1h { z20.h }, p0, [x24, x22, LSL #1]\n"
- "fmin z19.h, p3/M, z19.h, z14.h\n"
- "st1h { z19.h }, p0, [x23]\n"
- "fmin z18.h, p3/M, z18.h, z14.h\n"
- "fmin z17.h, p3/M, z17.h, z14.h\n"
- "st1h { z18.h }, p0, [x23, x17, LSL #1]\n"
- "fmax z16.h, p3/M, z16.h, z15.h\n"
- "st1h { z17.h }, p0, [x23, x26, LSL #1]\n"
- "fmin z16.h, p3/M, z16.h, z14.h\n"
- "st1h { z16.h }, p0, [x23, x22, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z14.h\n"
+ "fmax z30.h, p3/M, z30.h, z14.h\n"
+ "fmax z31.h, p3/M, z31.h, z14.h\n"
+ "fmin z16.h, p3/M, z16.h, z13.h\n"
+ "st1h { z16.h }, p0, [x15]\n"
+ "fmin z17.h, p3/M, z17.h, z13.h\n"
+ "fmin z18.h, p3/M, z18.h, z13.h\n"
+ "st1h { z17.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z19.h, p3/M, z19.h, z13.h\n"
+ "fmin z20.h, p3/M, z20.h, z13.h\n"
+ "st1h { z18.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z13.h\n"
+ "fmin z22.h, p3/M, z22.h, z13.h\n"
+ "st1h { z19.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z13.h\n"
+ "fmin z24.h, p3/M, z24.h, z13.h\n"
+ "st1h { z20.h }, p0, [x9]\n"
+ "fmin z25.h, p3/M, z25.h, z13.h\n"
+ "fmin z26.h, p3/M, z26.h, z13.h\n"
+ "st1h { z21.h }, p0, [x9, x6, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z13.h\n"
+ "fmin z28.h, p3/M, z28.h, z13.h\n"
+ "st1h { z22.h }, p0, [x9, x25, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z13.h\n"
+ "fmin z30.h, p3/M, z30.h, z13.h\n"
+ "st1h { z23.h }, p0, [x9, x22, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z13.h\n"
+ "st1h { z24.h }, p0, [x26]\n"
+ "st1h { z25.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z26.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z27.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z30.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z31.h }, p0, [x23, x22, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 66f6c3bb7a..972b78b6d5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,613 +98,613 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cnth x12\n"
- "ld1h { z13.h }, p3/Z, [x15]\n"
- "sub x11, XZR, x12\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z15.h }, p3/Z, [x17]\n"
+ "cnth x15\n"
+ "mov x14, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x10, x9, [x14, #0x0]\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1h { z9.h }, p2/Z, [x10, x13, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "ld1h { z11.h }, p2/Z, [x28, x13, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "sub x13, XZR, x15\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z13\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x26, [x14, #0x20]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z13\n fmla z30.h, p3/M, z7.h, z9.h\n"
- "ldr x25, [x14, #0x28]\n"
- "inch x11\n"
- "movprfx z29, z13\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x24, [x14, #0x30]\n"
- "mov p0.b, p2.b\n"
- "movprfx z27, z13\n fmla z27.h, p3/M, z5.h, z9.h\n"
- "ldr x23, [x14, #0x38]\n"
- "movprfx z26, z13\n fmla z26.h, p3/M, z4.h, z9.h\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z25, z13\n fmla z25.h, p3/M, z3.h, z9.h\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z23, z13\n fmla z23.h, p3/M, z2.h, z9.h\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z22, z13\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z21, z13\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x13, LSL #1]\n"
- "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "ldr x26, [x14, #0x60]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z26.h, p3/M, z5.h, z12.h\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "ldr x21, [x16, #0x8]\n"
- "movprfx z24, z13\n fmla z24.h, p3/M, z3.h, z12.h\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z22.h, p3/M, z2.h, z12.h\n"
- "ldr x19, [x16, #0x18]\n"
- "fmla z21.h, p3/M, z1.h, z12.h\n"
- "movprfx z20, z13\n fmla z20.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "movprfx z19, z13\n fmla z19.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "movprfx z16, z13\n fmla z16.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z9.h\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z26.h, p3/M, z7.h, z9.h\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z25.h, p3/M, z6.h, z9.h\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z23.h, p3/M, z5.h, z9.h\n"
- "fmla z22.h, p3/M, z4.h, z9.h\n"
- "fmla z21.h, p3/M, z3.h, z9.h\n"
- "fmla z19.h, p3/M, z2.h, z9.h\n"
- "movprfx z18, z13\n fmla z18.h, p3/M, z1.h, z9.h\n"
- "movprfx z17, z13\n fmla z17.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ldr x27, [x14, #0x98]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "ld1h { z13.h }, p3/Z, [x15]\n"
- "fmla z24.h, p3/M, z6.h, z10.h\n"
- "fmla z22.h, p3/M, z5.h, z10.h\n"
- "fmla z21.h, p3/M, z4.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x26, [x16, #0x30]\n"
+ "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "ldr x24, [x16, #0x38]\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0x70]\n"
"fmla z16.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z27.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z11.h\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z19.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z20.h, p3/M, z8.h, z11.h\n"
- "ldr x9, [x14, #0xc8]\n"
- "fmla z16.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z19.h, p3/M, z7.h, z11.h\n"
- "ldr x27, [x14, #0xd8]\n"
- "fmla z18.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z10.h\n"
- "ldr x28, [x14, #0xd0]\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z10.h\n"
- "fmla z26.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z17.h, p3/M, z8.h, z11.h\n"
- "ldr x26, [x14, #0xe0]\n"
- "fmla z16.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ldr x24, [x14, #0xf0]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z7.h, z11.h\n"
- "ldr x23, [x14, #0xf8]\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z4.h, z11.h\n"
- "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ldr x27, [x16, #0x60]\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "inch x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "ldr x22, [x28, #0x8]\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0x78]\n"
+ "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "whilelt p0.h, x15, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
"fmla z19.h, p3/M, z1.h, z11.h\n"
- "fmla z18.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "ldr x10, [x14, #0x100]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z6.h, z10.h\n"
- "ldr x9, [x14, #0x108]\n"
+ "ld1h { z11.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ldr x27, [x16, #0xa0]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
"fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "ldr x28, [x14, #0x110]\n"
- "fmla z24.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z5.h, z11.h\n"
- "fmla z20.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z11.h\n"
- "fmla z16.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z12.h\n"
- "ldr x27, [x14, #0x118]\n"
- "fmla z24.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z19.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z8.h, z11.h\n"
- "fmla z19.h, p3/M, z5.h, z11.h\n"
- "fmla z18.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z12.h\n"
- "fmla z20.h, p3/M, z5.h, z12.h\n"
- "fmla z16.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z19.h, p3/M, z8.h, z10.h\n"
- "fmla z18.h, p3/M, z7.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldr x11, [x16, #0xc8]\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ldr x9, [x16, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x10, [x16, #0xd0]\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
"fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z21.h, p3/M, z7.h, z11.h\n"
- "fmla z20.h, p3/M, z6.h, z11.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z17.h, p3/M, z4.h, z11.h\n"
- "fmla z16.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ldp x10, x9, [x14, #0x0]\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ldr x27, [x16, #0xe0]\n"
"fmla z18.h, p3/M, z8.h, z12.h\n"
- "ld1h { z9.h }, p1/Z, [x10, x12, LSL #1]\n"
- "fmla z17.h, p3/M, z7.h, z12.h\n"
- "fmla z16.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z1.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x13, LSL #1]\n"
- "inch x13\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0xf0]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "ldr x12, [x16, #0x100]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldr x11, [x16, #0x108]\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x10, [x16, #0x110]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "ldr x9, [x16, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
"fmla z29.h, p3/M, z5.h, z11.h\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "whilelt p2.h, x13, %x[n_channels]\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x28, x12, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z22.h, p3/M, z6.h, z12.h\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "fmla z19.h, p3/M, z4.h, z12.h\n"
- "fmla z18.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x27, x12, LSL #1]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z20.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z10.h\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
"fmla z16.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p1/Z, [x9, x12, LSL #1]\n"
- "inch x12\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmax z29.h, p3/M, z29.h, z15.h\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmax z28.h, p3/M, z28.h, z15.h\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
- "fmin z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z31.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z14.h\n"
- "fmin z29.h, p3/M, z29.h, z14.h\n"
- "ldr x22, [x16, #0x20]\n"
- "fmin z28.h, p3/M, z28.h, z14.h\n"
- "st1h { z30.h }, p0, [x21, x11, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z14.h\n"
- "fmax z26.h, p3/M, z26.h, z15.h\n"
- "st1h { z29.h }, p0, [x20, x11, LSL #1]\n"
- "fmax z25.h, p3/M, z25.h, z15.h\n"
- "st1h { z28.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z24.h, p3/M, z24.h, z15.h\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z23.h, p3/M, z23.h, z15.h\n"
- "st1h { z27.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z26.h, p3/M, z26.h, z14.h\n"
- "ldr x20, [x16, #0x30]\n"
- "fmin z25.h, p3/M, z25.h, z14.h\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z24.h, p3/M, z24.h, z14.h\n"
- "ldr x22, [x16, #0x40]\n"
- "fmin z23.h, p3/M, z23.h, z14.h\n"
- "st1h { z26.h }, p0, [x21, x11, LSL #1]\n"
- "fmax z22.h, p3/M, z22.h, z15.h\n"
- "st1h { z25.h }, p0, [x20, x11, LSL #1]\n"
- "fmax z21.h, p3/M, z21.h, z15.h\n"
- "st1h { z24.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z15.h\n"
- "st1h { z23.h }, p0, [x22, x11, LSL #1]\n"
- "fmax z19.h, p3/M, z19.h, z15.h\n"
- "ldr x21, [x16, #0x48]\n"
- "fmin z22.h, p3/M, z22.h, z14.h\n"
- "ldr x20, [x16, #0x50]\n"
- "fmin z21.h, p3/M, z21.h, z14.h\n"
- "ldr x19, [x16, #0x58]\n"
- "fmin z20.h, p3/M, z20.h, z14.h\n"
- "ldr x22, [x16, #0x60]\n"
- "fmin z19.h, p3/M, z19.h, z14.h\n"
- "st1h { z22.h }, p0, [x21, x11, LSL #1]\n"
- "fmax z18.h, p3/M, z18.h, z15.h\n"
- "st1h { z21.h }, p0, [x20, x11, LSL #1]\n"
- "fmax z17.h, p3/M, z17.h, z15.h\n"
- "st1h { z20.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z16.h, p3/M, z16.h, z15.h\n"
- "st1h { z19.h }, p0, [x22, x11, LSL #1]\n"
- "ldr x21, [x16, #0x68]\n"
- "fmin z18.h, p3/M, z18.h, z14.h\n"
- "ldr x20, [x16, #0x70]\n"
- "fmin z17.h, p3/M, z17.h, z14.h\n"
- "ldr x19, [x16, #0x78]\n"
- "fmin z16.h, p3/M, z16.h, z14.h\n"
- "st1h { z18.h }, p0, [x21, x11, LSL #1]\n"
- "st1h { z17.h }, p0, [x20, x11, LSL #1]\n"
- "st1h { z16.h }, p0, [x19, x11, LSL #1]\n"
+ "fmax z16.h, p3/M, z16.h, z14.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmax z17.h, p3/M, z17.h, z14.h\n"
+ "fmax z18.h, p3/M, z18.h, z14.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmax z19.h, p3/M, z19.h, z14.h\n"
+ "fmin z16.h, p3/M, z16.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "fmin z17.h, p3/M, z17.h, z13.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmin z18.h, p3/M, z18.h, z13.h\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmin z19.h, p3/M, z19.h, z13.h\n"
+ "fmax z20.h, p3/M, z20.h, z14.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmax z21.h, p3/M, z21.h, z14.h\n"
+ "fmax z22.h, p3/M, z22.h, z14.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmax z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z16.h }, p1, [x23, x13, LSL #1]\n"
+ "st1h { z17.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "st1h { z18.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "st1h { z19.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "fmin z20.h, p3/M, z20.h, z13.h\n"
+ "fmin z21.h, p3/M, z21.h, z13.h\n"
+ "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "fmin z22.h, p3/M, z22.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z13.h\n"
+ "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x22, [x28, #0x48]\n"
+ "fmax z24.h, p3/M, z24.h, z14.h\n"
+ "fmax z25.h, p3/M, z25.h, z14.h\n"
+ "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x28, #0x50]\n"
+ "fmax z26.h, p3/M, z26.h, z14.h\n"
+ "fmax z27.h, p3/M, z27.h, z14.h\n"
+ "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x28, #0x58]\n"
+ "inch x14\n"
+ "ld1h { z9.h }, p0/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x11, x15, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z13.h\n"
+ "ld1h { z11.h }, p0/Z, [x10, x15, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x9, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmin z25.h, p3/M, z25.h, z13.h\n"
+ "fmin z26.h, p3/M, z26.h, z13.h\n"
+ "fmin z27.h, p3/M, z27.h, z13.h\n"
+ "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x23, [x28, #0x60]\n"
+ "fmax z28.h, p3/M, z28.h, z14.h\n"
+ "fmax z29.h, p3/M, z29.h, z14.h\n"
+ "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x22, [x28, #0x68]\n"
+ "fmax z30.h, p3/M, z30.h, z14.h\n"
+ "fmax z31.h, p3/M, z31.h, z14.h\n"
+ "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x28, #0x70]\n"
+ "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x28, #0x78]\n"
+ "ld1h { z15.h }, p3/Z, [x17]\n"
+ "whilelt p2.h, x14, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "fmin z28.h, p3/M, z28.h, z13.h\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "fmin z29.h, p3/M, z29.h, z13.h\n"
+ "fmin z30.h, p3/M, z30.h, z13.h\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z31.h, p3/M, z31.h, z13.h\n"
+ "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z13\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x26, [x14, #0x20]\n"
- "inch x11\n"
- "movprfx z30, z13\n fmla z30.h, p3/M, z7.h, z9.h\n"
- "ldr x25, [x14, #0x28]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z13\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x24, [x14, #0x30]\n"
- "movprfx z27, z13\n fmla z27.h, p3/M, z5.h, z9.h\n"
- "ldr x23, [x14, #0x38]\n"
- "movprfx z26, z13\n fmla z26.h, p3/M, z4.h, z9.h\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z25, z13\n fmla z25.h, p3/M, z3.h, z9.h\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z23, z13\n fmla z23.h, p3/M, z2.h, z9.h\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z22, z13\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z21, z13\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x13, LSL #1]\n"
- "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "ldr x26, [x14, #0x60]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z26.h, p3/M, z5.h, z12.h\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z25.h, p3/M, z4.h, z12.h\n"
- "ldr x21, [x16, #0x8]\n"
- "movprfx z24, z13\n fmla z24.h, p3/M, z3.h, z12.h\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z22.h, p3/M, z2.h, z12.h\n"
- "ldr x19, [x16, #0x18]\n"
- "fmla z21.h, p3/M, z1.h, z12.h\n"
- "movprfx z20, z13\n fmla z20.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "movprfx z19, z13\n fmla z19.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "movprfx z16, z13\n fmla z16.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z8.h, z9.h\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z26.h, p3/M, z7.h, z9.h\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z25.h, p3/M, z6.h, z9.h\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z23.h, p3/M, z5.h, z9.h\n"
- "fmla z22.h, p3/M, z4.h, z9.h\n"
- "fmla z21.h, p3/M, z3.h, z9.h\n"
- "fmla z19.h, p3/M, z2.h, z9.h\n"
- "movprfx z18, z13\n fmla z18.h, p3/M, z1.h, z9.h\n"
- "movprfx z17, z13\n fmla z17.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ldr x27, [x14, #0x98]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z24.h, p3/M, z6.h, z10.h\n"
- "fmla z22.h, p3/M, z5.h, z10.h\n"
- "fmla z21.h, p3/M, z4.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x26, [x16, #0x30]\n"
+ "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "ldr x24, [x16, #0x38]\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+ "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0x70]\n"
"fmla z16.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z27.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z11.h\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z19.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z20.h, p3/M, z8.h, z11.h\n"
- "ldr x9, [x14, #0xc8]\n"
- "fmla z16.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "fmla z24.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z19.h, p3/M, z7.h, z11.h\n"
- "ldr x27, [x14, #0xd8]\n"
- "fmla z18.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z10.h\n"
- "ldr x28, [x14, #0xd0]\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z10.h\n"
- "fmla z26.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z17.h, p3/M, z8.h, z11.h\n"
- "ldr x26, [x14, #0xe0]\n"
- "fmla z16.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z5.h, z12.h\n"
- "fmla z24.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ldr x24, [x14, #0xf0]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z7.h, z11.h\n"
- "ldr x23, [x14, #0xf8]\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z4.h, z11.h\n"
- "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ldr x27, [x16, #0x60]\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z8.h, z12.h\n"
+ "inch x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z18.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "fmla z19.h, p3/M, z6.h, z12.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "ldr x22, [x28, #0x8]\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0x78]\n"
+ "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z12.h\n"
+ "fmla z17.h, p3/M, z0.h, z12.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
"fmla z19.h, p3/M, z1.h, z11.h\n"
- "fmla z18.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "ldr x10, [x14, #0x100]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z6.h, z10.h\n"
- "ldr x9, [x14, #0x108]\n"
+ "ld1h { z11.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ldr x27, [x16, #0xa0]\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
"fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "ldr x28, [x14, #0x110]\n"
- "fmla z24.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z5.h, z11.h\n"
- "fmla z20.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z2.h, z11.h\n"
- "fmla z16.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z12.h\n"
- "ldr x27, [x14, #0x118]\n"
- "fmla z24.h, p3/M, z5.h, z12.h\n"
- "fmla z20.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z27.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "fmla z19.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z8.h, z11.h\n"
- "fmla z19.h, p3/M, z5.h, z11.h\n"
- "fmla z18.h, p3/M, z4.h, z11.h\n"
- "fmla z17.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z12.h\n"
- "fmla z20.h, p3/M, z5.h, z12.h\n"
- "fmla z16.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z19.h, p3/M, z8.h, z10.h\n"
- "fmla z18.h, p3/M, z7.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z17.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldr x11, [x16, #0xc8]\n"
+ "fmla z17.h, p3/M, z5.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z1.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ldr x9, [x16, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z11.h\n"
+ "fmla z29.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x10, [x16, #0xd0]\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
"fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x10, x13, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z21.h, p3/M, z7.h, z11.h\n"
- "fmla z20.h, p3/M, z6.h, z11.h\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z17.h, p3/M, z4.h, z11.h\n"
- "fmla z16.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ldr x27, [x16, #0xe0]\n"
"fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z17.h, p3/M, z7.h, z12.h\n"
- "fmla z16.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z1.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x13, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0xf0]\n"
+ "fmla z16.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z24.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "ldr x12, [x16, #0x100]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "fmla z19.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "ldr x11, [x16, #0x108]\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x10, [x16, #0x110]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z12.h\n"
+ "ldr x9, [x16, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
"fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z6.h, z12.h\n"
- "fmla z19.h, p3/M, z4.h, z12.h\n"
- "fmla z18.h, p3/M, z3.h, z12.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
"fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "fmax z29.h, p3/M, z29.h, z15.h\n"
- "fmax z28.h, p3/M, z28.h, z15.h\n"
- "fmin z31.h, p3/M, z31.h, z14.h\n"
- "st1h { z31.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z14.h\n"
- "fmin z29.h, p3/M, z29.h, z14.h\n"
- "ldr x22, [x16, #0x20]\n"
- "fmin z28.h, p3/M, z28.h, z14.h\n"
- "st1h { z30.h }, p0, [x21, x11, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
- "fmax z26.h, p3/M, z26.h, z15.h\n"
- "st1h { z29.h }, p0, [x20, x11, LSL #1]\n"
- "fmax z25.h, p3/M, z25.h, z15.h\n"
- "st1h { z28.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z24.h, p3/M, z24.h, z15.h\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z23.h, p3/M, z23.h, z15.h\n"
- "ldr x20, [x16, #0x30]\n"
- "fmin z27.h, p3/M, z27.h, z14.h\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z26.h, p3/M, z26.h, z14.h\n"
- "st1h { z27.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z14.h\n"
- "fmin z24.h, p3/M, z24.h, z14.h\n"
- "st1h { z26.h }, p0, [x21, x11, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z14.h\n"
- "ldr x22, [x16, #0x40]\n"
- "fmax z22.h, p3/M, z22.h, z15.h\n"
- "ldr x21, [x16, #0x48]\n"
- "fmax z21.h, p3/M, z21.h, z15.h\n"
- "st1h { z25.h }, p0, [x20, x11, LSL #1]\n"
- "fmax z20.h, p3/M, z20.h, z15.h\n"
- "st1h { z24.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z19.h, p3/M, z19.h, z15.h\n"
- "st1h { z23.h }, p0, [x22, x11, LSL #1]\n"
- "fmin z22.h, p3/M, z22.h, z14.h\n"
- "ldr x20, [x16, #0x50]\n"
- "fmin z21.h, p3/M, z21.h, z14.h\n"
- "ldr x19, [x16, #0x58]\n"
- "fmin z20.h, p3/M, z20.h, z14.h\n"
- "ldr x22, [x16, #0x60]\n"
- "fmin z19.h, p3/M, z19.h, z14.h\n"
- "st1h { z22.h }, p0, [x21, x11, LSL #1]\n"
- "fmax z18.h, p3/M, z18.h, z15.h\n"
- "st1h { z21.h }, p0, [x20, x11, LSL #1]\n"
- "fmax z17.h, p3/M, z17.h, z15.h\n"
- "st1h { z20.h }, p0, [x19, x11, LSL #1]\n"
- "fmax z16.h, p3/M, z16.h, z15.h\n"
- "st1h { z19.h }, p0, [x22, x11, LSL #1]\n"
- "ldr x21, [x16, #0x68]\n"
- "fmin z18.h, p3/M, z18.h, z14.h\n"
- "ldr x20, [x16, #0x70]\n"
- "fmin z17.h, p3/M, z17.h, z14.h\n"
- "ldr x19, [x16, #0x78]\n"
- "fmin z16.h, p3/M, z16.h, z14.h\n"
- "st1h { z18.h }, p0, [x21, x11, LSL #1]\n"
- "st1h { z17.h }, p0, [x20, x11, LSL #1]\n"
- "st1h { z16.h }, p0, [x19, x11, LSL #1]\n"
+ "fmax z16.h, p3/M, z16.h, z14.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmax z17.h, p3/M, z17.h, z14.h\n"
+ "fmax z18.h, p3/M, z18.h, z14.h\n"
+ "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmax z19.h, p3/M, z19.h, z14.h\n"
+ "fmin z16.h, p3/M, z16.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "fmin z17.h, p3/M, z17.h, z13.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "fmin z18.h, p3/M, z18.h, z13.h\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmin z19.h, p3/M, z19.h, z13.h\n"
+ "fmax z20.h, p3/M, z20.h, z14.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmax z21.h, p3/M, z21.h, z14.h\n"
+ "fmax z22.h, p3/M, z22.h, z14.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmax z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z16.h }, p1, [x23, x13, LSL #1]\n"
+ "st1h { z17.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "st1h { z18.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "st1h { z19.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmin z20.h, p3/M, z20.h, z13.h\n"
+ "fmin z21.h, p3/M, z21.h, z13.h\n"
+ "fmin z22.h, p3/M, z22.h, z13.h\n"
+ "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "fmin z23.h, p3/M, z23.h, z13.h\n"
+ "fmax z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x22, [x28, #0x48]\n"
+ "fmax z25.h, p3/M, z25.h, z14.h\n"
+ "fmax z26.h, p3/M, z26.h, z14.h\n"
+ "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x28, #0x50]\n"
+ "fmax z27.h, p3/M, z27.h, z14.h\n"
+ "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x28, #0x58]\n"
+ "fmin z24.h, p3/M, z24.h, z13.h\n"
+ "fmin z25.h, p3/M, z25.h, z13.h\n"
+ "fmin z26.h, p3/M, z26.h, z13.h\n"
+ "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x23, [x28, #0x60]\n"
+ "fmin z27.h, p3/M, z27.h, z13.h\n"
+ "fmax z28.h, p3/M, z28.h, z14.h\n"
+ "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x22, [x28, #0x68]\n"
+ "fmax z29.h, p3/M, z29.h, z14.h\n"
+ "fmax z30.h, p3/M, z30.h, z14.h\n"
+ "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x28, #0x70]\n"
+ "fmax z31.h, p3/M, z31.h, z14.h\n"
+ "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x28, #0x78]\n"
+ "fmin z28.h, p3/M, z28.h, z13.h\n"
+ "fmin z29.h, p3/M, z29.h, z13.h\n"
+ "fmin z30.h, p3/M, z30.h, z13.h\n"
+ "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z13.h\n"
+ "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
+ "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
+ "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index d20e9913ae..6a9b354c02 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,246 +88,246 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x7, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x11, #0x0\n"
+ "mov x16, #0x0\n"
"1:" // Tile loop
- "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x23, #0x4\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x17, #0x2\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x15, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cnth x14\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x12, XZR, x14\n"
- "ldr x21, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x8, x13, x19\n" // offset += tile_j * ld_input_col
- "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
- "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x21, x21, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x9, x21, x22, LSL #1\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x28, x9, x22, LSL #1\n"
- "ld1h { z17.h }, p3/Z, [x16]\n"
- "add x27, x28, x22, LSL #1\n"
- "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
- "add x26, x27, x22, LSL #1\n"
- "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "add x25, x13, x13\n"
- "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
- "add x24, x25, x13\n"
- "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "add x23, x24, x13\n"
- "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
- "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
- "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "madd x19, x8, x11, x19\n" // offset += tile_j * ld_output_col
- "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
- "mul x19, x19, x17\n" // offset *= output_tile_size
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x13\n"
+ "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x15, x15\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x12, x12, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x12, x23, LSL #1\n"
+ "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z9.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x21]\n"
- "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "add x22, x10, x20, LSL #1\n"
- "ld1h { z12.h }, p2/Z, [x21, x24, LSL #1]\n"
- "addvl x16, x16, #16\n"
- "ld1h { z13.h }, p2/Z, [x21, x23, LSL #1]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "ld1h { z14.h }, p2/Z, [x9]\n"
- "ld1h { z15.h }, p2/Z, [x9, x13, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x21, x25, LSL #1]\n"
+ "ld1h { z19.h }, p3/Z, [x11]\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "add x27, x28, x23, LSL #1\n"
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "add x26, x10, x15\n"
+ "add x25, x27, x23, LSL #1\n"
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x24, x26, x15\n"
+ "add x9, x9, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "add x22, x9, x21, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x13\n"
+ "ld1h { z9.h }, p2/Z, [x27, x10, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x12]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x26, LSL #1]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1h { z13.h }, p2/Z, [x12, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x28]\n"
+ "ld1h { z15.h }, p2/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z17\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x14, %x[n_channels]\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z6.h, z9.h\n"
- "inch x12\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z17.h }, p3/Z, [x16]\n"
- "inch x15\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "addvl x21, x21, #1\n"
- "ld1h { z10.h }, p1/Z, [x21]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "inch x14\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x27]\n"
- "addvl x9, x9, #1\n"
- "fmla z30.h, p3/M, z0.h, z16.h\n"
- "fmla z29.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x28]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z15.h\n"
- "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "inch x21\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
"ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "inch x13\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z14.h }, p2/Z, [x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
"fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n"
+ "ld1h { z15.h }, p2/Z, [x27]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "addvl x12, x12, #1\n"
"addvl x28, x28, #1\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "ld1h { z9.h }, p1/Z, [x28, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x26]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
- "fmla z29.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n"
- "addvl x27, x27, #1\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x21, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p1/Z, [x21, x23, LSL #1]\n"
- "fmax z31.h, p3/M, z31.h, z19.h\n"
- "fmla z28.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n"
- "fmax z30.h, p3/M, z30.h, z19.h\n"
- "fmla z29.h, p3/M, z5.h, z16.h\n"
- "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "fmin z31.h, p3/M, z31.h, z18.h\n"
- "st1h { z31.h }, p0, [x10]\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z15.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n"
- "whilelt p2.h, x15, %x[n_channels]\n"
- "fmin z30.h, p3/M, z30.h, z18.h\n"
- "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
- "addvl x26, x26, #1\n"
- "fmla z28.h, p3/M, z3.h, z16.h\n"
- "ld1h { z16.h }, p1/Z, [x21, x25, LSL #1]\n"
- "cmp x14, %x[n_channels]\n"
- "fmax z29.h, p3/M, z29.h, z19.h\n"
- "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "st1h { z30.h }, p0, [x10, x11, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z14.h\n"
- "ld1h { z14.h }, p1/Z, [x9]\n"
- "addvl x10, x10, #1\n"
- "fmin z29.h, p3/M, z29.h, z18.h\n"
- "st1h { z29.h }, p0, [x22]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ld1h { z19.h }, p3/Z, [x11]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z14.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "inch x20\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x23, x26, LSL #1]\n"
"fmla z28.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p1/Z, [x9, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
- "fmax z28.h, p3/M, z28.h, z19.h\n"
- "addvl x16, x16, #16\n"
- "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "fmin z28.h, p3/M, z28.h, z18.h\n"
- "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "st1h { z28.h }, p0, [x22, x11, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "cmp x13, %x[n_channels]\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "ld1h { z10.h }, p1/Z, [x12]\n"
+ "ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "addvl x25, x25, #1\n"
+ "ld1h { z14.h }, p1/Z, [x28]\n"
+ "ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
+ "st1h { z28.h }, p0, [x9]\n"
+ "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x14, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z17\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z6.h, z9.h\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x7, #0x1\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x8, x8, #0x1\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "cmp x8, x19\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
- "csel x8, x8, XZR, LT\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n"
- "csel x7, x7, x21, LT\n"
- "fmla z31.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x27]\n"
- "cmp x7, x20\n"
- "fmla z30.h, p3/M, z0.h, z16.h\n"
- "fmla z29.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x28]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z15.h\n"
- "fmla z31.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
"ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x26, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z14.h }, p2/Z, [x25]\n"
+ "add x16, x16, #0x1\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
"fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x26]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z29.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "fmax z31.h, p3/M, z31.h, z19.h\n"
- "fmla z28.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n"
- "fmax z30.h, p3/M, z30.h, z19.h\n"
- "fmla z29.h, p3/M, z5.h, z16.h\n"
- "fmin z31.h, p3/M, z31.h, z18.h\n"
- "st1h { z31.h }, p0, [x10]\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z15.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z18.h\n"
- "st1h { z30.h }, p0, [x10, x11, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z16.h\n"
- "fmax z29.h, p3/M, z29.h, z19.h\n"
- "fmla z28.h, p3/M, z7.h, z14.h\n"
- "fmin z29.h, p3/M, z29.h, z18.h\n"
- "st1h { z29.h }, p0, [x22]\n"
+ "ld1h { z15.h }, p2/Z, [x27]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "cmp x16, x20\n"
+ "add x21, x11, #0x1\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z14.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "csel x11, x11, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x23, x26, LSL #1]\n"
"fmla z28.h, p3/M, z6.h, z15.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmax z28.h, p3/M, z28.h, z19.h\n"
- "fmin z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z28.h }, p0, [x22, x11, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x23]\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x23, x10, LSL #1]\n"
+ "cmp x11, x20\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x9]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x14, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index ceba36d897..ff97b51e28 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,247 +87,247 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cnth x12\n"
- "ldp x11, x10, [x19, #0x0]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x19, #0x10]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z17.h }, p3/Z, [x15]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "ld1h { z11.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x22, x13, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x21, x13, LSL #1]\n"
- "ld1h { z15.h }, p2/Z, [x20, x13, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x19, x13, LSL #1]\n"
+ "ld1h { z19.h }, p3/Z, [x16]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z15.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z17\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x26, [x14, #0x40]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z6.h, z9.h\n"
- "ldr x25, [x14, #0x48]\n"
- "inch x9\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x24, [x14, #0x50]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ldr x23, [x14, #0x58]\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z30.h, p3/M, z0.h, z16.h\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z29.h, p3/M, z3.h, z14.h\n"
- "ldr x25, [x14, #0x88]\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z31.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z15.h\n"
- "ld1h { z14.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z31.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
"fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z16.h\n"
- "ldr x20, [x14, #0xb0]\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "ld1h { z15.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x22, x13, LSL #1]\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z31.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z17.h }, p3/Z, [x15]\n"
- "fmla z29.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z5.h, z14.h\n"
- "fmax z31.h, p3/M, z31.h, z19.h\n"
- "ld1h { z14.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "fmax z30.h, p3/M, z30.h, z19.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "inch x13\n"
- "fmla z29.h, p3/M, z5.h, z16.h\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "whilelt p2.h, x13, %x[n_channels]\n"
- "fmin z31.h, p3/M, z31.h, z18.h\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "fmin z30.h, p3/M, z30.h, z18.h\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "fmla z28.h, p3/M, z3.h, z16.h\n"
- "ld1h { z9.h }, p1/Z, [x26, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z15.h\n"
- "ld1h { z10.h }, p1/Z, [x25, x12, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z14.h\n"
- "ld1h { z12.h }, p1/Z, [x23, x12, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x22, x12, LSL #1]\n"
- "fmax z29.h, p3/M, z29.h, z19.h\n"
- "ld1h { z14.h }, p1/Z, [x21, x12, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ldr x27, [x15, #0x80]\n"
+ "ld1h { z15.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x9, LSL #1]\n"
"fmla z28.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p1/Z, [x20, x12, LSL #1]\n"
- "ld1h { z16.h }, p1/Z, [x19, x12, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z18.h\n"
- "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x24, x12, LSL #1]\n"
- "inch x12\n"
- "fmax z28.h, p3/M, z28.h, z19.h\n"
- "st1h { z30.h }, p0, [x10, x9, LSL #1]\n"
- "cmp x12, %x[n_channels]\n"
- "fmin z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "st1h { z28.h }, p0, [x27, x9, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "blt 1b\n"
- "2:" // Channel tail
- "movprfx z31, z17\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x26, [x14, #0x40]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x23, [x15, #0xa0]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z15.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "ldr x27, [x15, #0xc0]\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"inch x9\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z6.h, z9.h\n"
- "ldr x25, [x14, #0x48]\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "inch x28\n"
+ "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
"mov p0.b, p2.b\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "ldr x24, [x14, #0x50]\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z0.h, z9.h\n"
- "ldr x23, [x14, #0x58]\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z30.h, p3/M, z0.h, z16.h\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z29.h, p3/M, z3.h, z14.h\n"
- "ldr x25, [x14, #0x88]\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z31.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z15.h\n"
- "ld1h { z14.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z31.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "whilelt p2.h, x9, %x[n_channels]\n"
+ "ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "inch x14\n"
+ "ld1h { z19.h }, p3/Z, [x16]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z28.h, p3/M, z4.h, z15.h\n"
"fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z31.h, p3/M, z5.h, z13.h\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z16.h\n"
- "ldr x20, [x14, #0xb0]\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "ld1h { z15.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "ld1h { z13.h }, p2/Z, [x22, x13, LSL #1]\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z31.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmax z31.h, p3/M, z31.h, z19.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "fmax z30.h, p3/M, z30.h, z19.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z18.h\n"
- "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z16.h\n"
- "fmla z28.h, p3/M, z3.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z18.h\n"
- "st1h { z30.h }, p0, [x10, x9, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z14.h\n"
- "fmla z29.h, p3/M, z8.h, z15.h\n"
+ "ld1h { z14.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla z28.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ldr x27, [x15, #0x80]\n"
+ "ld1h { z15.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z28.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "fmla z30.h, p3/M, z3.h, z14.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z15.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x9, LSL #1]\n"
"fmla z28.h, p3/M, z6.h, z15.h\n"
- "fmax z29.h, p3/M, z29.h, z19.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmin z29.h, p3/M, z29.h, z18.h\n"
- "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
- "fmax z28.h, p3/M, z28.h, z19.h\n"
- "fmin z28.h, p3/M, z28.h, z18.h\n"
- "st1h { z28.h }, p0, [x27, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x23, [x15, #0xa0]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "fmla z30.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z14.h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z15.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "ldr x27, [x15, #0xc0]\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z15.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z28.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z29.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x10, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 1c2e1e27ad..e6bfea1790 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,432 +88,432 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x5, #0x0\n"
- "mov x6, #0x0\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
"1:" // Tile loop
- "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x20, #0x2\n"
- "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x7, #0x2\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x17, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cnth x16\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x14, XZR, x16\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
"ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
- "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
- "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x13, x13, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cnth x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x11, x14, x23, LSL #1\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #1\n"
+ "add x28, x15, x17\n"
"ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x20, x13, x22, LSL #1\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x10, x20, x22, LSL #1\n"
- "ld1h { z16.h }, p3/Z, [x8]\n"
- "add x9, x10, x22, LSL #1\n"
- "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
- "add x28, x9, x22, LSL #1\n"
- "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
- "add x27, x28, x22, LSL #1\n"
- "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
- "add x26, x15, x15\n"
- "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
- "add x25, x26, x15\n"
- "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
- "add x24, x25, x15\n"
- "mul x19, x5, x21\n" // offset = tile_i * ld_output_row
- "add x23, x24, x15\n"
- "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x7\n" // offset *= output_tile_size
- "add x11, x11, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "add x22, x11, x21, LSL #1\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z5.h }, p2/Z, [x13]\n"
- "ld1h { z6.h }, p2/Z, [x13, x15, LSL #1]\n"
- "cmp x16, %x[n_channels]\n"
- "ld1h { z7.h }, p2/Z, [x20]\n"
- "addvl x8, x8, #6\n"
- "ld1h { z8.h }, p2/Z, [x20, x15, LSL #1]\n"
- "ld1h { z9.h }, p2/Z, [x13, x26, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x20, x26, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x13, x25, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x13, x24, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x20, x23, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x10]\n"
+ "add x27, x9, x23, LSL #1\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #1\n"
+ "ld1h { z16.h }, p3/Z, [x10]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z5.h }, p2/Z, [x14]\n"
+ "ld1h { z6.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "sub x20, XZR, x12\n"
+ "ld1h { z7.h }, p2/Z, [x11]\n"
+ "ld1h { z8.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "addvl x10, x10, #6\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x9]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n"
- "whilelt p1.h, x16, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z6.h\n"
- "inch x14\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z7.h\n"
+ "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z5.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z7.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z0.h }, p3/Z, [x10]\n"
+ "inch x21\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "inch x12\n"
+ "fmla z30.h, p3/M, z1.h, z8.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x10, #1, MUL VL]\n"
"mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x8]\n"
- "inch x17\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n"
- "addvl x20, x20, #1\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "inch x16\n"
- "fmla z29.h, p3/M, z1.h, z8.h\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n"
- "addvl x13, x13, #1\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z5.h\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z6.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z7.h\n"
- "ld1h { z7.h }, p1/Z, [x20]\n"
- "fmla z30.h, p3/M, z0.h, z8.h\n"
- "fmla z29.h, p3/M, z0.h, z14.h\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n"
- "addvl x10, x10, #1\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
"fmla z28.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n"
- "addvl x8, x8, #16\n"
- "fmla z31.h, p3/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x9]\n"
- "ld1h { z16.h }, p3/Z, [x8, #4, MUL VL]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z5.h\n"
- "fmla z28.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z1.h, z6.h\n"
- "fmla z28.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n"
- "addvl x9, x9, #1\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x28]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z6.h\n"
- "fmla z29.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "inch x20\n"
+ "fmla z30.h, p3/M, z3.h, z5.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z6.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z7.h }, p1/Z, [x11]\n"
+ "fmla z30.h, p3/M, z0.h, z14.h\n"
+ "fmla z31.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z8.h\n"
"fmla z29.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x8]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n"
- "addvl x28, x28, #1\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
"fmla z29.h, p3/M, z2.h, z5.h\n"
- "fmla z28.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z28.h, p3/M, z3.h, z5.h\n"
"fmla z29.h, p3/M, z3.h, z6.h\n"
- "fmla z28.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z14.h\n"
- "ld1h { z14.h }, p1/Z, [x10]\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z5.h }, p2/Z, [x27]\n"
+ "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z6.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z14.h\n"
"fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z14.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z5.h\n"
+ "fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
"fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z9.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z5.h }, p1/Z, [x13]\n"
- "fmla z30.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
"fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n"
- "whilelt p2.h, x17, %x[n_channels]\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
"addvl x27, x27, #1\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z6.h }, p1/Z, [x13, x15, LSL #1]\n"
- "addvl x8, x8, #16\n"
- "fmla z30.h, p3/M, z3.h, z8.h\n"
- "cmp x16, %x[n_channels]\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x13, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p1/Z, [x20, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p1/Z, [x20, x23, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x13, x24, LSL #1]\n"
- "fmla z28.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x13, x26, LSL #1]\n"
- "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "addvl x8, x8, #-6\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z9.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x25]\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z5.h\n"
+ "fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z31.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z6.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "fmla z31.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x10]\n"
+ "fmla z28.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "fmla z31.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x23]\n"
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "fmla z31.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x9]\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "fmla z29.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "fmla z29.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z5.h }, p1/Z, [x14]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "fmla z29.h, p3/M, z3.h, z8.h\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
"fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x11]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x11, x12, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
"fmin z28.h, p3/M, z28.h, z17.h\n"
- "addvl x11, x11, #1\n"
- "st1h { z29.h }, p0, [x22]\n"
- "st1h { z28.h }, p0, [x22, x12, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
+ "ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
+ "st1h { z28.h }, p0, [x13]\n"
+ "st1h { z29.h }, p0, [x13, x16, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z31.h }, p0, [x22, x16, LSL #1]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z6.h\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x5, #0x1\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z7.h\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x8]\n"
- "add x6, x6, #0x1\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z5.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z7.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z0.h }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z30.h, p3/M, z1.h, z8.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "cmp x8, x20\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x17, LSL #1]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z29.h, p3/M, z1.h, z8.h\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x6, x19\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z3.h, z5.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z6.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z30.h, p3/M, z0.h, z14.h\n"
+ "fmla z31.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z8.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
"fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n"
- "csel x6, x6, XZR, LT\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "csel x5, x5, x21, LT\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "cmp x5, x20\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z5.h\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z6.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z7.h\n"
- "fmla z30.h, p3/M, z0.h, z8.h\n"
- "fmla z29.h, p3/M, z0.h, z14.h\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z28.h, p3/M, z3.h, z5.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x27]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z6.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z14.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z14.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z5.h\n"
+ "fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z9.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x25]\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z5.h\n"
+ "fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z9.h\n"
+ "fmla z31.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z28.h, p3/M, z1.h, z6.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z6.h }, p2/Z, [x25, x28, LSL #1]\n"
"fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x10]\n"
+ "fmla z28.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x24, LSL #1]\n"
"fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n"
- "addvl x8, x8, #16\n"
- "fmla z31.h, p3/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x9]\n"
+ "fmla z31.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x23]\n"
"fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
"fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z5.h\n"
- "fmla z28.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "fmla z29.h, p3/M, z1.h, z5.h\n"
"fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z1.h, z6.h\n"
- "fmla z28.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "fmla z29.h, p3/M, z2.h, z6.h\n"
"fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x28]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z6.h\n"
- "fmla z29.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x8]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "fmla z28.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "fmla z28.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z14.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z1.h, z5.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "fmla z30.h, p3/M, z2.h, z6.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "fmla z30.h, p3/M, z3.h, z8.h\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z9.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "fmla z29.h, p3/M, z3.h, z8.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
"fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x11]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x11, x12, LSL #1]\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
"fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z29.h }, p0, [x22]\n"
- "st1h { z28.h }, p0, [x22, x12, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z28.h }, p0, [x13]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z29.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x16, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index a0640daeca..2e20b524d8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,450 +98,450 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x15, x14, [x20, #0x0]\n"
"mov x13, #0x0\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cnth x12\n"
- "ldp x11, x10, [x19, #0x0]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x19, #0x10]\n"
- "whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z16.h }, p3/Z, [x15]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x15, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #6\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "ld1h { z5.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ld1h { z6.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ld1h { z7.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ld1h { z8.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ld1h { z9.h }, p2/Z, [x22, x13, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x21, x13, LSL #1]\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "ldp x26, x25, [x14, #0x40]\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "cnth x28\n"
+ "ptrue p2.b\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1h { z5.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "cmp x28, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ldp x26, x25, [x16, #0x10]\n"
+ "sub x24, XZR, x28\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ldp x10, x9, [x16, #0x40]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "ld1h { z0.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x27, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "addvl x27, x27, #6\n"
+ "ld1h { z8.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x9, x13, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z5.h\n"
- "ldr x24, [x14, #0x50]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z6.h\n"
- "ldr x23, [x14, #0x58]\n"
- "inch x9\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z7.h\n"
- "ldr x22, [x14, #0x60]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z8.h\n"
- "ld1h { z5.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x15]\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z29.h, p3/M, z1.h, z8.h\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ldr x25, [x14, #0x88]\n"
- "fmla z29.h, p3/M, z3.h, z5.h\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z6.h\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z7.h\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z30.h, p3/M, z0.h, z8.h\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z29.h, p3/M, z0.h, z14.h\n"
- "ldr x20, [x14, #0xb0]\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #5, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "ldr x25, [x14, #0xc8]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmla z31.h, p3/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ldr x24, [x14, #0xd0]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "ld1h { z16.h }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x23, [x14, #0xd8]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "ldr x22, [x14, #0xe0]\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "ldr x19, [x14, #0xf8]\n"
- "fmla z29.h, p3/M, z0.h, z5.h\n"
- "fmla z28.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #-6, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x14, #0xe8]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z1.h, z6.h\n"
- "fmla z28.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #-5, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x13, LSL #1]\n"
- "ldr x20, [x14, #0xf0]\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #-4, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ldr x26, [x14, #0x100]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #-3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x14, #0x108]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #-2, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ldr x24, [x14, #0x110]\n"
- "fmla z30.h, p3/M, z0.h, z6.h\n"
- "fmla z29.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #-1, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x23, [x14, #0x118]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x15]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "fmla z28.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "fmla z28.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z14.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #5, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z1.h, z5.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z9.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #6, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z5.h }, p1/Z, [x26, x12, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z6.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "movprfx z28, z16\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z16\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x26, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "movprfx z30, z16\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z16\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x25, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x27]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #4, MUL VL]\n"
+ "ldr x26, [x16, #0x90]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "ldr x25, [x16, #0x98]\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z8.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
+ "ldr x26, [x16, #0xd0]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
+ "ldr x25, [x16, #0xd8]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #-6, MUL VL]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, #-5, MUL VL]\n"
+ "whilelt p1.h, x28, %x[n_channels]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #-4, MUL VL]\n"
+ "inch x24\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z13.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #-2, MUL VL]\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ldr x26, [x16, #0x110]\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #-1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x27, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0x118]\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x27]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z5.h }, p1/Z, [x10, x28, LSL #1]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldp x26, x25, [x16, #0x10]\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z6.h }, p1/Z, [x9, x28, LSL #1]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ldp x10, x9, [x16, #0x40]\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
"inch x13\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "whilelt p2.h, x13, %x[n_channels]\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z6.h }, p1/Z, [x25, x12, LSL #1]\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "fmla z30.h, p3/M, z3.h, z8.h\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "ld1h { z7.h }, p1/Z, [x24, x12, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z13.h }, p1/Z, [x21, x12, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p1/Z, [x23, x12, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z11.h }, p1/Z, [x20, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x19, x12, LSL #1]\n"
- "fmla z28.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x22, x12, LSL #1]\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "ldp x26, x25, [x14, #0x40]\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #7, MUL VL]\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "addvl x15, x15, #16\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "ld1h { z10.h }, p1/Z, [x26, x12, LSL #1]\n"
- "ld1h { z14.h }, p1/Z, [x25, x12, LSL #1]\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "inch x12\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
- "st1h { z30.h }, p0, [x10, x9, LSL #1]\n"
- "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
- "st1h { z28.h }, p0, [x27, x9, LSL #1]\n"
+ "ld1h { z7.h }, p1/Z, [x26, x28, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z8.h }, p1/Z, [x25, x28, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x21, x28, LSL #1]\n"
+ "fmax z28.h, p2/M, z28.h, z18.h\n"
+ "fmax z29.h, p2/M, z29.h, z18.h\n"
+ "ld1h { z12.h }, p1/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x10, x28, LSL #1]\n"
+ "fmax z30.h, p2/M, z30.h, z18.h\n"
+ "fmax z31.h, p2/M, z31.h, z18.h\n"
+ "ld1h { z14.h }, p1/Z, [x9, x28, LSL #1]\n"
+ "inch x28\n"
+ "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "whilelt p3.h, x13, %x[n_channels]\n"
+ "cmp x28, %x[n_channels]\n"
+ "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
+ "fmin z28.h, p2/M, z28.h, z17.h\n"
+ "fmin z29.h, p2/M, z29.h, z17.h\n"
+ "st1h { z28.h }, p0, [x15, x24, LSL #1]\n"
+ "fmin z30.h, p2/M, z30.h, z17.h\n"
+ "fmin z31.h, p2/M, z31.h, z17.h\n"
+ "st1h { z29.h }, p0, [x14, x24, LSL #1]\n"
+ "st1h { z30.h }, p0, [x12, x24, LSL #1]\n"
+ "addvl x27, x27, #-6\n"
+ "st1h { z31.h }, p0, [x11, x24, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z5.h\n"
- "ldr x24, [x14, #0x50]\n"
- "inch x9\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z0.h, z6.h\n"
- "ldr x23, [x14, #0x58]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z0.h, z7.h\n"
- "ldr x22, [x14, #0x60]\n"
- "movprfx z28, z16\n fmla z28.h, p3/M, z0.h, z8.h\n"
- "ld1h { z5.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x15]\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z9.h\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z29.h, p3/M, z1.h, z8.h\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #1, MUL VL]\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #2, MUL VL]\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ldr x25, [x14, #0x88]\n"
- "fmla z29.h, p3/M, z3.h, z5.h\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #3, MUL VL]\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z6.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #4, MUL VL]\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z31.h, p3/M, z0.h, z7.h\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z30.h, p3/M, z0.h, z8.h\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z29.h, p3/M, z0.h, z14.h\n"
- "fmla z28.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #5, MUL VL]\n"
- "ldr x20, [x14, #0xb0]\n"
- "fmla z31.h, p3/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #6, MUL VL]\n"
- "ldr x25, [x14, #0xc8]\n"
- "fmla z31.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmla z31.h, p3/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ldr x24, [x14, #0xd0]\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z9.h\n"
- "fmla z28.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #-8, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x23, [x14, #0xd8]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #-7, MUL VL]\n"
- "ldr x22, [x14, #0xe0]\n"
- "fmla z31.h, p3/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "ldr x19, [x14, #0xf8]\n"
- "fmla z29.h, p3/M, z0.h, z5.h\n"
- "fmla z28.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #-6, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x14, #0xe8]\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z1.h, z6.h\n"
- "fmla z28.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x15, #-5, MUL VL]\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x13, LSL #1]\n"
- "ldr x20, [x14, #0xf0]\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #-4, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "ldr x26, [x14, #0x100]\n"
- "fmla z30.h, p3/M, z3.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #-3, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x14, #0x108]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x22, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #-2, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x24, x13, LSL #1]\n"
- "ldr x24, [x14, #0x110]\n"
- "fmla z30.h, p3/M, z0.h, z6.h\n"
- "fmla z29.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x15, #-1, MUL VL]\n"
- "fmla z31.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x23, x13, LSL #1]\n"
- "ldr x23, [x14, #0x118]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x15]\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "fmla z28.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "fmla z28.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x19, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z14.h\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z13.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z0.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z1.h, z5.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "fmla z30.h, p3/M, z2.h, z6.h\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x23, x13, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "fmla z30.h, p3/M, z3.h, z8.h\n"
- "fmla z29.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z9.h\n"
- "fmax z31.h, p3/M, z31.h, z18.h\n"
- "fmax z30.h, p3/M, z30.h, z18.h\n"
- "fmax z29.h, p3/M, z29.h, z18.h\n"
- "fmax z28.h, p3/M, z28.h, z18.h\n"
- "fmin z31.h, p3/M, z31.h, z17.h\n"
- "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
- "fmin z30.h, p3/M, z30.h, z17.h\n"
- "fmin z29.h, p3/M, z29.h, z17.h\n"
- "st1h { z30.h }, p0, [x10, x9, LSL #1]\n"
- "fmin z28.h, p3/M, z28.h, z17.h\n"
- "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
- "st1h { z28.h }, p0, [x27, x9, LSL #1]\n"
+ "movprfx z28, z16\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z16\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x26, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "movprfx z30, z16\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z16\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x25, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x27]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #4, MUL VL]\n"
+ "ldr x26, [x16, #0x90]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "ldr x25, [x16, #0x98]\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #5, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z8.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #-8, MUL VL]\n"
+ "ldr x26, [x16, #0xd0]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #-7, MUL VL]\n"
+ "ldr x25, [x16, #0xd8]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #-6, MUL VL]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, #-5, MUL VL]\n"
+ "inch x24\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z13.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #-2, MUL VL]\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z5.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ldr x26, [x16, #0x110]\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z6.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0x118]\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x27]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x9, x13, LSL #1]\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmax z28.h, p2/M, z28.h, z18.h\n"
+ "fmax z29.h, p2/M, z29.h, z18.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ "fmax z30.h, p2/M, z30.h, z18.h\n"
+ "fmax z31.h, p2/M, z31.h, z18.h\n"
+ "fmin z28.h, p2/M, z28.h, z17.h\n"
+ "fmin z29.h, p2/M, z29.h, z17.h\n"
+ "st1h { z28.h }, p0, [x15, x24, LSL #1]\n"
+ "fmin z30.h, p2/M, z30.h, z17.h\n"
+ "fmin z31.h, p2/M, z31.h, z17.h\n"
+ "st1h { z29.h }, p0, [x14, x24, LSL #1]\n"
+ "st1h { z30.h }, p0, [x12, x24, LSL #1]\n"
+ "st1h { z31.h }, p0, [x11, x24, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
deleted file mode 100644
index eddcffc196..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-
-namespace arm_conv {
-namespace depthwise {
-
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
- const float *const *const input_ptrs,
- float *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const float activation_min,
- const float activation_max
-)
-{
- const float *const inptrs[16] = {
- input_ptrs[0], input_ptrs[1], input_ptrs[4], input_ptrs[5], input_ptrs[2], input_ptrs[6], input_ptrs[3], input_ptrs[7], input_ptrs[8], input_ptrs[9], input_ptrs[10], input_ptrs[11], input_ptrs[12], input_ptrs[13], input_ptrs[14], input_ptrs[15],
- };
- const float minmax_vals[2] = { activation_min, activation_max };
-
- __asm__ __volatile__(
- "ldp x26, x23, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "ldp x25, x16, [%x[inptrs], #0x10]\n"
- "mov x15, #0x0\n"
- "ld1w { z15.s }, p2/Z, [%x[params]]\n"
- "mov z14.d, z15.d\n"
- "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "cntw x14\n"
- "mov z12.d, z15.d\n"
- "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sub x13, XZR, x14\n"
- "mov z10.d, z15.d\n"
- "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "whilelt p1.s, XZR, %x[n_channels]\n"
- "mov z8.d, z15.d\n"
- "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "addvl %x[params], %x[params], #16\n"
- "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
- "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "addvl %x[params], %x[params], #-6\n"
- "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
- "ldp x24, x12, [%x[inptrs], #0x20]\n"
- "ldp x23, x11, [%x[inptrs], #0x30]\n"
- "ldp x10, x9, [%x[inptrs], #0x40]\n"
- "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
- "ldp x28, x27, [%x[inptrs], #0x50]\n"
- "ldp x26, x25, [%x[inptrs], #0x60]\n"
- "ldp x24, x23, [%x[inptrs], #0x70]\n"
- "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n"
- "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n"
- "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n"
- "ldp x22, x21, [%x[outptrs], #0x0]\n"
- "ldp x20, x19, [%x[outptrs], #0x10]\n"
- "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n"
- "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n"
- "bge 1f\n"
- "1:" // Loop
- "fmla z14.s, p2/M, z13.s, z3.s\n"
- "ld1w { z15.s }, p2/Z, [%x[params]]\n"
- "incw x13\n"
- "fmla z12.s, p2/M, z13.s, z0.s\n"
- "ldp x26, x23, [%x[inptrs], #0x0]\n"
- "mov p0.b, p1.b\n"
- "fmla z10.s, p2/M, z13.s, z31.s\n"
- "ldp x25, x16, [%x[inptrs], #0x10]\n"
- "mov x15, x14\n"
- "fmla z8.s, p2/M, z13.s, z30.s\n"
- "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "incw x14\n"
- "fmla z14.s, p2/M, z11.s, z0.s\n"
- "ldp x24, x12, [%x[inptrs], #0x20]\n"
- "whilelt p1.s, x15, %x[n_channels]\n"
- "fmla z12.s, p2/M, z11.s, z29.s\n"
- "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
- "cmp x14, %x[n_channels]\n"
- "fmla z10.s, p2/M, z11.s, z30.s\n"
- "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
- "ldp x23, x11, [%x[inptrs], #0x30]\n"
- "fmla z8.s, p2/M, z11.s, z28.s\n"
- "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "fmla z14.s, p2/M, z9.s, z29.s\n"
- "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z9.s, z27.s\n"
- "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
- "fmla z10.s, p2/M, z9.s, z28.s\n"
- "ldp x10, x9, [%x[inptrs], #0x40]\n"
- "fmla z8.s, p2/M, z9.s, z26.s\n"
- "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "fmla z14.s, p2/M, z7.s, z31.s\n"
- "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z7.s, z30.s\n"
- "ldp x28, x27, [%x[inptrs], #0x50]\n"
- "fmla z10.s, p2/M, z7.s, z25.s\n"
- "ldp x26, x25, [%x[inptrs], #0x60]\n"
- "fmla z8.s, p2/M, z7.s, z24.s\n"
- "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "fmla z14.s, p2/M, z6.s, z30.s\n"
- "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z6.s, z28.s\n"
- "ldp x24, x23, [%x[inptrs], #0x70]\n"
- "fmla z10.s, p2/M, z6.s, z24.s\n"
- "fmla z8.s, p2/M, z6.s, z23.s\n"
- "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "fmla z14.s, p2/M, z5.s, z28.s\n"
- "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z5.s, z26.s\n"
- "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
- "fmla z10.s, p2/M, z5.s, z23.s\n"
- "fmla z8.s, p2/M, z5.s, z22.s\n"
- "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "fmla z14.s, p2/M, z4.s, z25.s\n"
- "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z4.s, z24.s\n"
- "fmla z10.s, p2/M, z4.s, z21.s\n"
- "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n"
- "fmla z8.s, p2/M, z4.s, z20.s\n"
- "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "addvl %x[params], %x[params], #16\n"
- "fmla z14.s, p2/M, z2.s, z24.s\n"
- "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z2.s, z23.s\n"
- "fmla z10.s, p2/M, z2.s, z20.s\n"
- "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n"
- "fmla z8.s, p2/M, z2.s, z19.s\n"
- "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "fmla z14.s, p2/M, z1.s, z23.s\n"
- "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n"
- "fmla z12.s, p2/M, z1.s, z22.s\n"
- "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n"
- "fmla z10.s, p2/M, z1.s, z19.s\n"
- "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n"
- "fmla z8.s, p2/M, z1.s, z18.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "addvl %x[params], %x[params], #-6\n"
- "fmax z14.s, p2/M, z14.s, z17.s\n"
- "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n"
- "fmax z12.s, p2/M, z12.s, z17.s\n"
- "fmax z10.s, p2/M, z10.s, z17.s\n"
- "fmax z8.s, p2/M, z8.s, z17.s\n"
- "fmin z14.s, p2/M, z14.s, z16.s\n"
- "st1w { z14.s }, p0, [x22, x13, LSL #2]\n"
- "mov z14.d, z15.d\n"
- "fmin z12.s, p2/M, z12.s, z16.s\n"
- "st1w { z12.s }, p0, [x21, x13, LSL #2]\n"
- "mov z12.d, z15.d\n"
- "fmin z10.s, p2/M, z10.s, z16.s\n"
- "st1w { z10.s }, p0, [x20, x13, LSL #2]\n"
- "mov z10.d, z15.d\n"
- "fmin z8.s, p2/M, z8.s, z16.s\n"
- "st1w { z8.s }, p0, [x19, x13, LSL #2]\n"
- "mov z8.d, z15.d\n"
- "blt 1b\n"
- "2:" // Tail
- "fmla z14.s, p2/M, z13.s, z3.s\n"
- "incw x13\n"
- "fmla z12.s, p2/M, z13.s, z0.s\n"
- "mov p0.b, p1.b\n"
- "fmla z10.s, p2/M, z13.s, z31.s\n"
- "fmla z8.s, p2/M, z13.s, z30.s\n"
- "fmla z14.s, p2/M, z11.s, z0.s\n"
- "fmla z12.s, p2/M, z11.s, z29.s\n"
- "fmla z10.s, p2/M, z11.s, z30.s\n"
- "fmla z8.s, p2/M, z11.s, z28.s\n"
- "fmla z14.s, p2/M, z9.s, z29.s\n"
- "fmla z12.s, p2/M, z9.s, z27.s\n"
- "fmla z10.s, p2/M, z9.s, z28.s\n"
- "fmla z8.s, p2/M, z9.s, z26.s\n"
- "fmla z14.s, p2/M, z7.s, z31.s\n"
- "fmla z12.s, p2/M, z7.s, z30.s\n"
- "fmla z10.s, p2/M, z7.s, z25.s\n"
- "fmla z8.s, p2/M, z7.s, z24.s\n"
- "fmla z14.s, p2/M, z6.s, z30.s\n"
- "fmla z12.s, p2/M, z6.s, z28.s\n"
- "fmla z10.s, p2/M, z6.s, z24.s\n"
- "fmla z8.s, p2/M, z6.s, z23.s\n"
- "fmla z14.s, p2/M, z5.s, z28.s\n"
- "fmla z12.s, p2/M, z5.s, z26.s\n"
- "fmla z10.s, p2/M, z5.s, z23.s\n"
- "fmla z8.s, p2/M, z5.s, z22.s\n"
- "fmla z14.s, p2/M, z4.s, z25.s\n"
- "fmla z12.s, p2/M, z4.s, z24.s\n"
- "fmla z10.s, p2/M, z4.s, z21.s\n"
- "fmla z8.s, p2/M, z4.s, z20.s\n"
- "fmla z14.s, p2/M, z2.s, z24.s\n"
- "fmla z12.s, p2/M, z2.s, z23.s\n"
- "fmla z10.s, p2/M, z2.s, z20.s\n"
- "fmla z8.s, p2/M, z2.s, z19.s\n"
- "fmla z14.s, p2/M, z1.s, z23.s\n"
- "fmla z12.s, p2/M, z1.s, z22.s\n"
- "fmla z10.s, p2/M, z1.s, z19.s\n"
- "fmla z8.s, p2/M, z1.s, z18.s\n"
- "fmax z14.s, p2/M, z14.s, z17.s\n"
- "fmax z12.s, p2/M, z12.s, z17.s\n"
- "fmax z10.s, p2/M, z10.s, z17.s\n"
- "fmax z8.s, p2/M, z8.s, z17.s\n"
- "fmin z14.s, p2/M, z14.s, z16.s\n"
- "st1w { z14.s }, p0, [x22, x13, LSL #2]\n"
- "fmin z12.s, p2/M, z12.s, z16.s\n"
- "fmin z10.s, p2/M, z10.s, z16.s\n"
- "st1w { z12.s }, p0, [x21, x13, LSL #2]\n"
- "fmin z8.s, p2/M, z8.s, z16.s\n"
- "st1w { z10.s }, p0, [x20, x13, LSL #2]\n"
- "st1w { z8.s }, p0, [x19, x13, LSL #2]\n"
- : [params] "+r" (params)
- : [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((unsigned long) n_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace depthwise
-} // namespace arm_conv
-
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 571246be3e..a570c5aa6a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,225 +88,225 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x17, #0x0\n"
- "mov x16, #0x0\n"
+ "mov x10, #0x0\n"
+ "mov x14, #0x0\n"
"1:" // Tile loop
- "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x23, #0x2\n"
- "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x15, #0x2\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x13, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cntw x12\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x21, XZR, x12\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x17, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x16, x11, x19\n" // offset += tile_j * ld_input_col
- "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
- "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x10, x10, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x27, x10, x22, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x26, x27, x22, LSL #2\n"
- "ld1w { z16.s }, p3/Z, [x14]\n"
- "add x25, x26, x22, LSL #2\n"
- "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
- "add x24, x11, x11\n"
- "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
- "add x23, x24, x11\n"
- "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
- "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
- "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
- "madd x19, x16, x9, x19\n" // offset += tile_j * ld_output_col
- "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
- "mul x19, x19, x15\n" // offset *= output_tile_size
- "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
- "add x28, x28, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
- "add x22, x28, x20, LSL #2\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+ "cntw x11\n"
+ "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z9.s }, p2/Z, [x27, x11, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x10]\n"
- "addvl x14, x14, #16\n"
- "ld1w { z11.s }, p2/Z, [x10, x23, LSL #2]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
- "addvl x14, x14, #-6\n"
- "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x26, x11, LSL #2]\n"
+ "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
+ "add x27, x13, x13\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "add x26, x9, x23, LSL #2\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "add x25, x26, x23, LSL #2\n"
+ "add x24, x27, x13\n"
+ "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x11, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "add x22, x28, x22, LSL #2\n"
+ "mov x21, #0x0\n"
+ "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "sub x20, XZR, x11\n"
+ "ld1w { z10.s }, p2/Z, [x9]\n"
+ "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "addvl x10, x10, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z16\n fmla z31.s, p3/M, z4.s, z9.s\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "whilelt p1.s, x11, %x[n_channels]\n"
"incw x21\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x23]\n"
+ "incw x11\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
"mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "incw x13\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x24, LSL #2]\n"
- "incw x12\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x23, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "ld1w { z16.s }, p3/Z, [x14]\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "incw x20\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26]\n"
"fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z31.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x11, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x24, LSL #2]\n"
- "addvl x10, x10, #1\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x23, LSL #2]\n"
- "addvl x27, x27, #1\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26]\n"
- "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x23, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x24, LSL #2]\n"
"addvl x26, x26, #1\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "ld1w { z13.s }, p1/Z, [x26, x11, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x11, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z10.s\n"
- "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x24, LSL #2]\n"
- "whilelt p2.s, x13, %x[n_channels]\n"
- "fmla z29.s, p3/M, z7.s, z11.s\n"
- "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x25]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
"addvl x25, x25, #1\n"
- "fmla z31.s, p3/M, z6.s, z9.s\n"
- "ld1w { z9.s }, p1/Z, [x27, x11, LSL #2]\n"
- "cmp x12, %x[n_channels]\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p1/Z, [x10]\n"
- "fmla z28.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x10, x23, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "addvl x14, x14, #16\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x27, x24, LSL #2]\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
- "addvl x14, x14, #-6\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x28]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x28, x9, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
+ "cmp x11, %x[n_channels]\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x9]\n"
+ "ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "addvl x23, x23, #1\n"
+ "st1w { z29.s }, p0, [x28, x12, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "st1w { z30.s }, p0, [x22]\n"
"addvl x28, x28, #1\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z29.s }, p0, [x22]\n"
- "st1w { z28.s }, p0, [x22, x9, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "addvl x10, x10, #-6\n"
+ "st1w { z31.s }, p0, [x22, x12, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z16\n fmla z31.s, p3/M, z4.s, z9.s\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z3.s, z9.s\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x17, #0x1\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x23]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x20\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "add x21, x10, #0x1\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x9, x27, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x25]\n"
- "add x16, x16, #0x1\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x23, LSL #2]\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "cmp x16, x19\n"
- "fmla z31.s, p3/M, z5.s, z12.s\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "csel x16, x16, XZR, LT\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "csel x10, x10, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26]\n"
+ "csel x14, x14, XZR, LT\n"
"fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x11, LSL #2]\n"
- "csel x17, x17, x21, LT\n"
- "fmla z29.s, p3/M, z6.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x24, LSL #2]\n"
- "cmp x17, x20\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x23, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x23, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x11, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x24, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z9.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "fmla z28.s, p3/M, z6.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x28]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x28, x9, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z29.s }, p0, [x22]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z28.s }, p0, [x22, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "cmp x10, x20\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x25]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z29.s }, p0, [x28, x12, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x12, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 77a6c683b0..903de0d309 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,215 +78,215 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cntw x12\n"
- "ldp x11, x10, [x19, #0x0]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x19, #0x10]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z16.s }, p3/Z, [x15]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "ldr x22, [x14, #0x20]\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x16]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z16\n fmla z31.s, p3/M, z4.s, z9.s\n"
- "ldr x21, [x14, #0x28]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z3.s, z9.s\n"
- "ldr x20, [x14, #0x30]\n"
- "incw x9\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "ldr x19, [x14, #0x38]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x21, x13, LSL #2]\n"
- "ldr x26, [x14, #0x40]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x14, #0x48]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "ldr x24, [x14, #0x50]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x14, #0x58]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z16.s }, p3/Z, [x15]\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ldr x22, [x14, #0x20]\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z8.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z30.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "ld1w { z13.s }, p1/Z, [x22, x12, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z10.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "fmla z29.s, p3/M, z7.s, z11.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "whilelt p2.s, x13, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z9.s\n"
- "ld1w { z9.s }, p1/Z, [x26, x12, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x24, x12, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p1/Z, [x25, x12, LSL #2]\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "addvl x15, x15, #16\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "ld1w { z12.s }, p1/Z, [x23, x12, LSL #2]\n"
- "incw x12\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x10, x9, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "incw x28\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "mov p0.b, p2.b\n"
+ "ld1w { z18.s }, p3/Z, [x16]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "incw x9\n"
+ "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "incw x14\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z16\n fmla z31.s, p3/M, z4.s, z9.s\n"
- "ldr x21, [x14, #0x28]\n"
- "incw x9\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z3.s, z9.s\n"
- "ldr x20, [x14, #0x30]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z1.s, z9.s\n"
- "ldr x19, [x14, #0x38]\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x21, x13, LSL #2]\n"
- "ldr x26, [x14, #0x40]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x14, #0x48]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldr x24, [x14, #0x50]\n"
- "fmla z31.s, p3/M, z5.s, z12.s\n"
- "ldr x23, [x14, #0x58]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z9.s\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.s, p3/M, z7.s, z13.s\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.s, p3/M, z6.s, z13.s\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ldr x19, [x14, #0x78]\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z9.s\n"
- "fmla z28.s, p3/M, z6.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x10, x9, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z28.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
deleted file mode 100644
index 65cb735bde..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <cstdint>
-
-#pragma once
-
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-
-namespace arm_conv {
-namespace depthwise {
-
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float);
-
-struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided
-{
- typedef float bias_type;
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- constexpr static unsigned int kernel_rows = 3;
- constexpr static unsigned int kernel_cols = 3;
-
- constexpr static unsigned int stride_rows = 1;
- constexpr static unsigned int stride_cols = 1;
-
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- kern_type kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl;
-
- sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided(const CPUInfo *) {}
-};
-
-} // namespace depthwise
-} // namespace arm_conv
-
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
deleted file mode 100644
index 97c4d88119..0000000000
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-
-namespace arm_conv {
-namespace depthwise {
-
-void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(
- const float *const inptr,
- const size_t in_row_stride,
- const size_t in_col_stride,
- float *const outptr,
- const size_t out_row_stride,
- const size_t out_col_stride,
- const void *params,
- unsigned long n_channels,
- const float activation_min,
- const float activation_max
-)
-{
- const float minmax_vals[2] = { activation_min, activation_max };
-
- __asm__ __volatile__(
- "ptrue p2.b\n"
- "ld1w { z15.s }, p2/Z, [%x[params]]\n"
- "mov z14.d, z15.d\n"
- "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "whilelt p1.s, XZR, %x[n_channels]\n"
- "mov z12.d, z15.d\n"
- "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "mov x26, %x[inptr]\n"
- "mov z10.d, z15.d\n"
- "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "add x25, x26, %x[in_row_stride], LSL #2\n"
- "mov z8.d, z15.d\n"
- "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "add x24, x25, %x[in_row_stride], LSL #2\n"
- "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "add x23, x24, %x[in_row_stride], LSL #2\n"
- "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "mov x22, %x[outptr]\n"
- "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "add x21, x22, %x[out_row_stride], LSL #2\n"
- "ld1w { z3.s }, p1/Z, [x26]\n"
- "add x20, %x[in_col_stride], %x[in_col_stride]\n"
- "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n"
- "add x19, x20, %x[in_col_stride]\n"
- "ld1w { z1.s }, p1/Z, [x25]\n"
- "addvl %x[params], %x[params], #16\n"
- "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n"
- "decw %x[n_channels]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "cmp %x[n_channels], XZR\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "addvl %x[params], %x[params], #-6\n"
- "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x24]\n"
- "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n"
- "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x23]\n"
- "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n"
- "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n"
- "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n"
- "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n"
- "ble 2f\n"
- "1:" // Loop
- "fmla z14.s, p2/M, z13.s, z3.s\n"
- "ld1w { z15.s }, p2/Z, [%x[params]]\n"
- "addvl x26, x26, #1\n"
- "fmla z12.s, p2/M, z13.s, z2.s\n"
- "addvl x25, x25, #1\n"
- "fmla z10.s, p2/M, z13.s, z1.s\n"
- "addvl x24, x24, #1\n"
- "fmla z8.s, p2/M, z13.s, z0.s\n"
- "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "addvl x23, x23, #1\n"
- "fmla z14.s, p2/M, z11.s, z2.s\n"
- "decw %x[n_channels]\n"
- "mov p0.b, p1.b\n"
- "fmla z12.s, p2/M, z11.s, z29.s\n"
- "fmla z10.s, p2/M, z11.s, z0.s\n"
- "whilelt p1.s, XZR, %x[n_channels]\n"
- "ld1w { z3.s }, p1/Z, [x26]\n"
- "fmla z8.s, p2/M, z11.s, z28.s\n"
- "cmp %x[n_channels], XZR\n"
- "fmla z14.s, p2/M, z9.s, z29.s\n"
- "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n"
- "fmla z12.s, p2/M, z9.s, z27.s\n"
- "fmla z10.s, p2/M, z9.s, z28.s\n"
- "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n"
- "fmla z8.s, p2/M, z9.s, z26.s\n"
- "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "fmla z14.s, p2/M, z7.s, z1.s\n"
- "ld1w { z1.s }, p1/Z, [x25]\n"
- "fmla z12.s, p2/M, z7.s, z0.s\n"
- "fmla z10.s, p2/M, z7.s, z25.s\n"
- "fmla z8.s, p2/M, z7.s, z24.s\n"
- "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "fmla z14.s, p2/M, z6.s, z0.s\n"
- "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n"
- "fmla z12.s, p2/M, z6.s, z28.s\n"
- "fmla z10.s, p2/M, z6.s, z24.s\n"
- "fmla z8.s, p2/M, z6.s, z23.s\n"
- "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "fmla z14.s, p2/M, z5.s, z28.s\n"
- "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n"
- "fmla z12.s, p2/M, z5.s, z26.s\n"
- "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n"
- "fmla z10.s, p2/M, z5.s, z23.s\n"
- "fmla z8.s, p2/M, z5.s, z22.s\n"
- "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "fmla z14.s, p2/M, z4.s, z25.s\n"
- "ld1w { z25.s }, p1/Z, [x24]\n"
- "fmla z12.s, p2/M, z4.s, z24.s\n"
- "fmla z10.s, p2/M, z4.s, z21.s\n"
- "ld1w { z21.s }, p1/Z, [x23]\n"
- "fmla z8.s, p2/M, z4.s, z20.s\n"
- "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "addvl %x[params], %x[params], #16\n"
- "fmla z14.s, p2/M, z31.s, z24.s\n"
- "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n"
- "fmla z12.s, p2/M, z31.s, z23.s\n"
- "fmla z10.s, p2/M, z31.s, z20.s\n"
- "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n"
- "fmla z8.s, p2/M, z31.s, z19.s\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "fmla z14.s, p2/M, z30.s, z23.s\n"
- "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n"
- "fmla z12.s, p2/M, z30.s, z22.s\n"
- "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n"
- "fmla z10.s, p2/M, z30.s, z19.s\n"
- "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n"
- "fmla z8.s, p2/M, z30.s, z18.s\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "addvl %x[params], %x[params], #-6\n"
- "fmax z14.s, p2/M, z14.s, z17.s\n"
- "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n"
- "fmax z12.s, p2/M, z12.s, z17.s\n"
- "fmax z10.s, p2/M, z10.s, z17.s\n"
- "fmax z8.s, p2/M, z8.s, z17.s\n"
- "fmin z14.s, p2/M, z14.s, z16.s\n"
- "st1w { z14.s }, p0, [x22]\n"
- "mov z14.d, z15.d\n"
- "fmin z12.s, p2/M, z12.s, z16.s\n"
- "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n"
- "mov z12.d, z15.d\n"
- "addvl x22, x22, #1\n"
- "fmin z10.s, p2/M, z10.s, z16.s\n"
- "st1w { z10.s }, p0, [x21]\n"
- "mov z10.d, z15.d\n"
- "fmin z8.s, p2/M, z8.s, z16.s\n"
- "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n"
- "mov z8.d, z15.d\n"
- "addvl x21, x21, #1\n"
- "bgt 1b\n"
- "2:" // Tail
- "fmla z14.s, p2/M, z13.s, z3.s\n"
- "mov p0.b, p1.b\n"
- "fmla z12.s, p2/M, z13.s, z2.s\n"
- "fmla z10.s, p2/M, z13.s, z1.s\n"
- "fmla z8.s, p2/M, z13.s, z0.s\n"
- "fmla z14.s, p2/M, z11.s, z2.s\n"
- "fmla z12.s, p2/M, z11.s, z29.s\n"
- "fmla z10.s, p2/M, z11.s, z0.s\n"
- "fmla z8.s, p2/M, z11.s, z28.s\n"
- "fmla z14.s, p2/M, z9.s, z29.s\n"
- "fmla z12.s, p2/M, z9.s, z27.s\n"
- "fmla z10.s, p2/M, z9.s, z28.s\n"
- "fmla z8.s, p2/M, z9.s, z26.s\n"
- "fmla z14.s, p2/M, z7.s, z1.s\n"
- "fmla z12.s, p2/M, z7.s, z0.s\n"
- "fmla z10.s, p2/M, z7.s, z25.s\n"
- "fmla z8.s, p2/M, z7.s, z24.s\n"
- "fmla z14.s, p2/M, z6.s, z0.s\n"
- "fmla z12.s, p2/M, z6.s, z28.s\n"
- "fmla z10.s, p2/M, z6.s, z24.s\n"
- "fmla z8.s, p2/M, z6.s, z23.s\n"
- "fmla z14.s, p2/M, z5.s, z28.s\n"
- "fmla z12.s, p2/M, z5.s, z26.s\n"
- "fmla z10.s, p2/M, z5.s, z23.s\n"
- "fmla z8.s, p2/M, z5.s, z22.s\n"
- "fmla z14.s, p2/M, z4.s, z25.s\n"
- "fmla z12.s, p2/M, z4.s, z24.s\n"
- "fmla z10.s, p2/M, z4.s, z21.s\n"
- "fmla z8.s, p2/M, z4.s, z20.s\n"
- "fmla z14.s, p2/M, z31.s, z24.s\n"
- "fmla z12.s, p2/M, z31.s, z23.s\n"
- "fmla z10.s, p2/M, z31.s, z20.s\n"
- "fmla z8.s, p2/M, z31.s, z19.s\n"
- "fmla z14.s, p2/M, z30.s, z23.s\n"
- "fmla z12.s, p2/M, z30.s, z22.s\n"
- "fmla z10.s, p2/M, z30.s, z19.s\n"
- "fmla z8.s, p2/M, z30.s, z18.s\n"
- "fmax z14.s, p2/M, z14.s, z17.s\n"
- "fmax z12.s, p2/M, z12.s, z17.s\n"
- "fmax z10.s, p2/M, z10.s, z17.s\n"
- "fmax z8.s, p2/M, z8.s, z17.s\n"
- "fmin z14.s, p2/M, z14.s, z16.s\n"
- "st1w { z14.s }, p0, [x22]\n"
- "fmin z12.s, p2/M, z12.s, z16.s\n"
- "fmin z10.s, p2/M, z10.s, z16.s\n"
- "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n"
- "fmin z8.s, p2/M, z8.s, z16.s\n"
- "st1w { z10.s }, p0, [x21]\n"
- "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n"
- : [n_channels] "+r" (n_channels), [params] "+r" (params)
- : [in_col_stride] "r" (in_col_stride), [in_row_stride] "r" (in_row_stride), [inptr] "r" (inptr), [minmax_vals] "r" (minmax_vals), [out_col_stride] "r" (out_col_stride), [out_row_stride] "r" (out_row_stride), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace depthwise
-} // namespace arm_conv
-
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index c485b7dde0..cda34358f5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,369 +88,369 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x6, #0x0\n"
- "mov x7, #0x0\n"
+ "mov x13, #0x0\n"
+ "mov x8, #0x0\n"
"1:" // Tile loop
- "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x3\n"
"mov x24, #0x3\n"
- "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x23, #0x3\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x17, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cntw x16\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x21, XZR, x16\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x15\n"
+ "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
"ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x6, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x7, x15, x19\n" // offset += tile_j * ld_input_col
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
- "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x14, x14, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x11, x14, x22, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x10, x11, x22, LSL #2\n"
- "ld1w { z16.s }, p3/Z, [x8]\n"
- "add x9, x10, x22, LSL #2\n"
- "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
- "add x28, x9, x22, LSL #2\n"
- "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
- "add x27, x15, x15\n"
- "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
- "add x26, x27, x15\n"
- "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
- "add x25, x26, x15\n"
- "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
- "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
- "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
- "madd x19, x7, x13, x19\n" // offset += tile_j * ld_output_col
- "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
- "mul x19, x19, x23\n" // offset *= output_tile_size
- "add x24, x13, x13\n"
- "add x12, x12, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x23, x12, x20, LSL #2\n"
- "add x22, x23, x20, LSL #2\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x12, x17, x17\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x14, x23, LSL #2\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x10, x23, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z9.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x13]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "add x28, x9, x23, LSL #2\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "add x27, x12, x17\n"
+ "add x11, x11, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "add x26, x28, x23, LSL #2\n"
+ "add x25, x27, x17\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "add x24, x11, x21, LSL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "add x22, x16, x16\n"
+ "mov x21, #0x0\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x9, x12, LSL #2]\n"
+ "sub x20, XZR, x15\n"
"ld1w { z10.s }, p2/Z, [x14]\n"
- "addvl x8, x8, #16\n"
"ld1w { z11.s }, p2/Z, [x14, x25, LSL #2]\n"
- "cmp x16, %x[n_channels]\n"
- "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
- "addvl x8, x8, #-6\n"
- "ld1w { z12.s }, p2/Z, [x28]\n"
- "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "addvl x13, x13, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x26]\n"
+ "ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z16\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x16, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z7.s, z9.s\n"
+ "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
"incw x21\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "incw x15\n"
"mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z5.s, z9.s\n"
- "incw x17\n"
- "movprfx z27, z16\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "incw x16\n"
- "movprfx z26, z16\n fmla z26.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z16\n fmla z25.s, p3/M, z2.s, z9.s\n"
- "movprfx z24, z16\n fmla z24.s, p3/M, z1.s, z9.s\n"
- "movprfx z23, z16\n fmla z23.s, p3/M, z0.s, z9.s\n"
- "ld1w { z16.s }, p3/Z, [x8]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x26, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z26.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z18.s }, p3/Z, [x13]\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z1.s, z11.s\n"
- "fmla z24.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11]\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x11, x25, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10]\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z10.s\n"
+ "fmla z31.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9]\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z24.s, p3/M, z2.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z13.s\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmla z26.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z24.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z26.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x26, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n"
- "addvl x11, x11, #1\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
"fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z5.s, z11.s\n"
- "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z13.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
"addvl x14, x14, #1\n"
- "fmla z24.s, p3/M, z8.s, z13.s\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
"ld1w { z10.s }, p1/Z, [x14]\n"
- "fmla z23.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
- "addvl x9, x9, #1\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10]\n"
+ "ld1w { z12.s }, p2/Z, [x9]\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z17.s\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z30.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n"
- "addvl x10, x10, #1\n"
"fmla z27.s, p3/M, z8.s, z13.s\n"
- "ld1w { z9.s }, p1/Z, [x10, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
- "whilelt p2.s, x17, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
- "addvl x28, x28, #1\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmla z25.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x28]\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z17.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z17.s\n"
+ "fmax z26.s, p3/M, z26.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z13.s\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z13.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
"ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
- "fmla z24.s, p3/M, z7.s, z13.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z25.s, p3/M, z8.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmla z23.s, p3/M, z6.s, z13.s\n"
- "ld1w { z13.s }, p1/Z, [x11, x27, LSL #2]\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
- "addvl x8, x8, #16\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
- "addvl x8, x8, #-6\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "st1w { z31.s }, p0, [x12]\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z30.s }, p0, [x12, x13, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z18.s\n"
- "fmax z26.s, p3/M, z26.s, z18.s\n"
- "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "addvl x12, x12, #1\n"
- "fmax z25.s, p3/M, z25.s, z18.s\n"
- "st1w { z28.s }, p0, [x23]\n"
- "fmin z27.s, p3/M, z27.s, z17.s\n"
- "fmin z26.s, p3/M, z26.s, z17.s\n"
- "st1w { z27.s }, p0, [x23, x13, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z17.s\n"
- "fmax z24.s, p3/M, z24.s, z18.s\n"
- "st1w { z26.s }, p0, [x23, x24, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x26]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
+ "st1w { z23.s }, p0, [x11]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z24.s }, p0, [x11, x16, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z25.s }, p0, [x11, x22, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "addvl x13, x13, #-6\n"
+ "st1w { z27.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
- "fmax z23.s, p3/M, z23.s, z18.s\n"
- "st1w { z25.s }, p0, [x22]\n"
- "fmin z24.s, p3/M, z24.s, z17.s\n"
- "fmin z23.s, p3/M, z23.s, z17.s\n"
- "st1w { z24.s }, p0, [x22, x13, LSL #2]\n"
- "st1w { z23.s }, p0, [x22, x24, LSL #2]\n"
- "addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z16\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z7.s, z9.s\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x6, #0x1\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "cmp x8, x20\n"
+ "add x21, x13, #0x1\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z5.s, z9.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x7, x7, #0x1\n"
- "movprfx z27, z16\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "cmp x7, x19\n"
- "movprfx z26, z16\n fmla z26.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z16\n fmla z25.s, p3/M, z2.s, z9.s\n"
- "csel x7, x7, XZR, LT\n"
- "movprfx z24, z16\n fmla z24.s, p3/M, z1.s, z9.s\n"
- "csel x6, x6, x21, LT\n"
- "movprfx z23, z16\n fmla z23.s, p3/M, z0.s, z9.s\n"
- "cmp x6, x20\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x26, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "csel x13, x13, x21, LT\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x8, x8, XZR, LT\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "cmp x13, x20\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "fmla z26.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z1.s, z11.s\n"
- "fmla z24.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11]\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x11, x25, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10]\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z10.s\n"
+ "fmla z31.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9]\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z24.s, p3/M, z2.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z13.s\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "fmla z26.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z24.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z26.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x26, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
"fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z5.s, z11.s\n"
- "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z13.s\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z13.s\n"
- "fmla z23.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10]\n"
+ "ld1w { z12.s }, p2/Z, [x9]\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z17.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z30.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n"
"fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z0.s, z12.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z25.s, p3/M, z8.s, z13.s\n"
- "fmla z24.s, p3/M, z7.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmla z23.s, p3/M, z6.s, z13.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x12]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x12, x13, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmax z27.s, p3/M, z27.s, z18.s\n"
- "st1w { z29.s }, p0, [x12, x24, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z18.s\n"
- "fmax z25.s, p3/M, z25.s, z18.s\n"
- "fmax z24.s, p3/M, z24.s, z18.s\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z28.s }, p0, [x23]\n"
- "fmin z27.s, p3/M, z27.s, z17.s\n"
- "fmin z26.s, p3/M, z26.s, z17.s\n"
- "st1w { z27.s }, p0, [x23, x13, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z17.s\n"
- "fmin z24.s, p3/M, z24.s, z17.s\n"
- "st1w { z26.s }, p0, [x23, x24, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z18.s\n"
- "st1w { z25.s }, p0, [x22]\n"
- "fmin z23.s, p3/M, z23.s, z17.s\n"
- "st1w { z24.s }, p0, [x22, x13, LSL #2]\n"
- "st1w { z23.s }, p0, [x22, x24, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z17.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z17.s\n"
+ "fmax z26.s, p3/M, z26.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z13.s\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z13.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z23.s }, p0, [x11]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z24.s }, p0, [x11, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z25.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z27.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 72b182679d..2eed8cb0c4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,387 +87,387 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cntw x12\n"
- "ld1w { z16.s }, p3/Z, [x15]\n"
- "sub x11, XZR, x12\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z18.s }, p3/Z, [x17]\n"
+ "cntw x15\n"
+ "mov x14, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x10, x9, [x14, #0x0]\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1w { z9.s }, p2/Z, [x10, x13, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "ldr x26, [x14, #0x20]\n"
- "ld1w { z11.s }, p2/Z, [x28, x13, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "sub x13, XZR, x15\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "ldr x28, [x16, #0x20]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x28, x14, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z16\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x25, [x14, #0x28]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z7.s, z9.s\n"
- "ldr x24, [x14, #0x30]\n"
- "incw x11\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x23, [x14, #0x38]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z5.s, z9.s\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z27, z16\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z26, z16\n fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z25, z16\n fmla z25.s, p3/M, z2.s, z9.s\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z24, z16\n fmla z24.s, p3/M, z1.s, z9.s\n"
- "ldr x26, [x14, #0x60]\n"
- "movprfx z23, z16\n fmla z23.s, p3/M, z0.s, z9.s\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x21, [x16, #0x8]\n"
+ "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x16, #0x30]\n"
+ "ldr x25, [x16, #0x38]\n"
+ "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ldr x24, [x16, #0x28]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x28, [x16, #0x60]\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "incw x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z26.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z30.s, p3/M, z6.s, z11.s\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "ldr x19, [x16, #0x18]\n"
+ "ldr x23, [x27, #0x0]\n"
+ "whilelt p0.s, x15, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z3.s, z11.s\n"
- "ld1w { z16.s }, p3/Z, [x15]\n"
- "fmla z25.s, p3/M, z1.s, z11.s\n"
- "fmla z24.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x14, #0x98]\n"
+ "ldr x22, [x27, #0x8]\n"
+ "ldr x21, [x27, #0x10]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ldr x20, [x27, #0x18]\n"
+ "ld1w { z18.s }, p3/Z, [x17]\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ldr x10, [x16, #0x90]\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z24.s, p3/M, z2.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z13.s\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z3.s, z12.s\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z10.s\n"
+ "ldr x28, [x16, #0xa0]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z26.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z24.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z26.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
"fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z5.s, z11.s\n"
- "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z13.s\n"
- "ldr x26, [x14, #0x20]\n"
- "fmla z23.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ldr x28, [x16, #0x20]\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z17.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z17.s\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z30.s, p3/M, z5.s, z13.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z23.s }, p1, [x23, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x27, #0x20]\n"
+ "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
"fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x13, LSL #2]\n"
- "incw x13\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "ldp x10, x9, [x14, #0x0]\n"
- "whilelt p2.s, x13, %x[n_channels]\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "fmla z25.s, p3/M, z0.s, z12.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "ld1w { z9.s }, p1/Z, [x10, x12, LSL #2]\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "ld1w { z10.s }, p1/Z, [x9, x12, LSL #2]\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x28, x12, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z13.s\n"
- "ld1w { z12.s }, p1/Z, [x27, x12, LSL #2]\n"
- "fmla z24.s, p3/M, z7.s, z13.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z23.s, p3/M, z6.s, z13.s\n"
- "ld1w { z13.s }, p1/Z, [x26, x12, LSL #2]\n"
- "incw x12\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "fmax z27.s, p3/M, z27.s, z18.s\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "ldr x22, [x16, #0x20]\n"
- "fmax z26.s, p3/M, z26.s, z18.s\n"
- "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "fmin z27.s, p3/M, z27.s, z17.s\n"
- "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
- "fmin z26.s, p3/M, z26.s, z17.s\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z25.s, p3/M, z25.s, z18.s\n"
- "ldr x20, [x16, #0x30]\n"
- "fmax z24.s, p3/M, z24.s, z18.s\n"
- "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z18.s\n"
- "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
- "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z17.s\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z24.s, p3/M, z24.s, z17.s\n"
- "ldr x22, [x16, #0x40]\n"
- "fmin z23.s, p3/M, z23.s, z17.s\n"
- "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
- "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
- "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "fmax z25.s, p3/M, z25.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z13.s\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmax z26.s, p3/M, z26.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z13.s\n"
+ "incw x14\n"
+ "ld1w { z9.s }, p0/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x11, x15, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x10, x15, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x9, x15, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "ld1w { z13.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "incw x15\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "st1w { z25.s }, p1, [x21, x13, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x22, [x27, #0x28]\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z27.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x21, [x27, #0x30]\n"
+ "ldr x20, [x27, #0x38]\n"
+ "ldr x23, [x27, #0x40]\n"
+ "whilelt p2.s, x14, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "st1w { z28.s }, p1, [x22, x13, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "st1w { z29.s }, p1, [x21, x13, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "st1w { z31.s }, p1, [x23, x13, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z16\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x25, [x14, #0x28]\n"
- "incw x11\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z7.s, z9.s\n"
- "ldr x24, [x14, #0x30]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x23, [x14, #0x38]\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z5.s, z9.s\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z27, z16\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z26, z16\n fmla z26.s, p3/M, z3.s, z9.s\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z25, z16\n fmla z25.s, p3/M, z2.s, z9.s\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z24, z16\n fmla z24.s, p3/M, z1.s, z9.s\n"
- "ldr x26, [x14, #0x60]\n"
- "movprfx z23, z16\n fmla z23.s, p3/M, z0.s, z9.s\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z28.s, p3/M, z2.s, z13.s\n"
- "ldr x21, [x16, #0x8]\n"
+ "movprfx z23, z18\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z18\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x16, #0x30]\n"
+ "ldr x25, [x16, #0x38]\n"
+ "movprfx z25, z18\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ldr x24, [x16, #0x28]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z18\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "movprfx z27, z18\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z18\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "movprfx z29, z18\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x28, [x16, #0x60]\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "movprfx z31, z18\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "incw x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z26.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z30.s, p3/M, z6.s, z11.s\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "ldr x19, [x16, #0x18]\n"
+ "ldr x23, [x27, #0x0]\n"
+ "ldr x22, [x27, #0x8]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "movprfx z30, z18\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z25.s, p3/M, z1.s, z11.s\n"
- "fmla z24.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x27, [x14, #0x98]\n"
+ "ldr x21, [x27, #0x10]\n"
+ "ldr x20, [x27, #0x18]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ldr x10, [x16, #0x90]\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z24.s, p3/M, z2.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z13.s\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z3.s, z12.s\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z10.s\n"
+ "ldr x28, [x16, #0xa0]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z10.s\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z26.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z24.s, p3/M, z4.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z26.s, p3/M, z8.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
"fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z11.s\n"
- "fmla z30.s, p3/M, z5.s, z11.s\n"
- "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
"fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z13.s\n"
- "fmla z23.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z17.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z17.s\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z30.s, p3/M, z5.s, z13.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z23.s }, p1, [x23, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x27, #0x20]\n"
+ "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
"fmla z27.s, p3/M, z8.s, z13.s\n"
- "fmla z26.s, p3/M, z7.s, z13.s\n"
- "fmla z24.s, p3/M, z5.s, z13.s\n"
- "fmla z23.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z0.s, z12.s\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z23.s, p3/M, z2.s, z11.s\n"
- "fmla z25.s, p3/M, z8.s, z13.s\n"
- "fmla z24.s, p3/M, z7.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmla z23.s, p3/M, z6.s, z13.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "ldr x22, [x16, #0x20]\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z18.s\n"
- "fmax z26.s, p3/M, z26.s, z18.s\n"
- "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z25.s, p3/M, z25.s, z18.s\n"
- "ldr x20, [x16, #0x30]\n"
- "fmax z24.s, p3/M, z24.s, z18.s\n"
- "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z17.s\n"
- "fmin z26.s, p3/M, z26.s, z17.s\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z25.s, p3/M, z25.s, z17.s\n"
- "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z17.s\n"
- "fmax z23.s, p3/M, z23.s, z18.s\n"
- "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
- "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z17.s\n"
- "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
- "ldr x22, [x16, #0x40]\n"
- "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z13.s\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z13.s\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z25.s }, p1, [x21, x13, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z17.s\n"
+ "fmax z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x22, [x27, #0x28]\n"
+ "fmax z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z27.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x21, [x27, #0x30]\n"
+ "ldr x20, [x27, #0x38]\n"
+ "ldr x23, [x27, #0x40]\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "st1w { z28.s }, p1, [x22, x13, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z29.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z30.s }, p1, [x20, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x23, x13, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 84b4b3b72b..cdf77a1cf0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,565 +88,565 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x2, #0x0\n"
- "mov x3, #0x0\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
"1:" // Tile loop
- "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
"mov x24, #0x4\n"
- "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x23, #0x4\n"
- "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x5, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cntw x6\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x21, XZR, x6\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
"ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
- "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x15, x8, x22, LSL #2\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x14, x15, x22, LSL #2\n"
- "ld1w { z13.s }, p3/Z, [x4]\n"
- "add x13, x14, x22, LSL #2\n"
- "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
- "add x12, x13, x22, LSL #2\n"
- "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
- "add x11, x12, x22, LSL #2\n"
- "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
- "add x10, x7, x7\n"
- "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
- "add x9, x10, x7\n"
- "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
- "add x28, x9, x7\n"
- "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
- "add x27, x28, x7\n"
- "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
- "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
- "add x26, x17, x17\n"
- "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x23\n" // offset *= output_tile_size
- "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x25, x16, x20, LSL #2\n"
- "add x24, x25, x20, LSL #2\n"
- "add x23, x24, x20, LSL #2\n"
- "add x22, x26, x17\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cntw x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z9.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x8, x23, LSL #2\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #2\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #2\n"
+ "add x10, x14, x5\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #2\n"
+ "add x28, x11, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #2\n"
+ "add x25, x6, x6\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #2\n"
+ "add x22, x25, x6\n"
+ "ld1w { z9.s }, p2/Z, [x12, x7, LSL #2]\n"
"ld1w { z10.s }, p2/Z, [x8]\n"
- "addvl x4, x4, #16\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
"ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
- "cmp x6, %x[n_channels]\n"
- "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n"
- "addvl x4, x4, #-6\n"
- "ld1w { z12.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z13\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x6, %x[n_channels]\n"
- "movprfx z30, z13\n fmla z30.s, p3/M, z7.s, z9.s\n"
+ "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
"incw x21\n"
- "movprfx z29, z13\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "incw x16\n"
"mov p0.b, p2.b\n"
- "movprfx z27, z13\n fmla z27.s, p3/M, z5.s, z9.s\n"
- "incw x5\n"
- "movprfx z26, z13\n fmla z26.s, p3/M, z4.s, z9.s\n"
- "incw x6\n"
- "movprfx z25, z13\n fmla z25.s, p3/M, z3.s, z9.s\n"
- "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z9.s\n"
- "movprfx z22, z13\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "movprfx z21, z13\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11]\n"
- "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "fmla z26.s, p3/M, z5.s, z12.s\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "movprfx z24, z13\n fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z2.s, z12.s\n"
- "fmla z21.s, p3/M, z1.s, z12.s\n"
- "movprfx z20, z13\n fmla z20.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
- "movprfx z19, z13\n fmla z19.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z9.s\n"
- "fmla z26.s, p3/M, z7.s, z9.s\n"
- "fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z9.s\n"
- "fmla z22.s, p3/M, z4.s, z9.s\n"
- "fmla z21.s, p3/M, z3.s, z9.s\n"
- "fmla z19.s, p3/M, z2.s, z9.s\n"
- "movprfx z18, z13\n fmla z18.s, p3/M, z1.s, z9.s\n"
- "movprfx z17, z13\n fmla z17.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x15]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z13.s }, p3/Z, [x4]\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z10.s\n"
- "fmla z22.s, p3/M, z5.s, z10.s\n"
- "fmla z21.s, p3/M, z4.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "fmla z27.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z11.s\n"
- "fmla z19.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z2.s, z10.s\n"
- "fmla z26.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z11.s\n"
- "fmla z16.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
- "fmla z19.s, p3/M, z7.s, z11.s\n"
- "fmla z18.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z27.s, p3/M, z4.s, z10.s\n"
- "fmla z26.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x24]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
+ "fmla z16.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z23.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
"fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z17.s, p3/M, z8.s, z11.s\n"
- "fmla z16.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z20.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n"
+ "fmla z16.s, p3/M, z5.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z21.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x14, LSL #2]\n"
"addvl x8, x8, #1\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14]\n"
- "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z20.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z16.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z4.s, z11.s\n"
- "fmla z22.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "fmla z18.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
- "addvl x14, x14, #1\n"
- "fmla z31.s, p3/M, z6.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x14, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13]\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x5, LSL #2]\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z24.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z5.s, z11.s\n"
- "fmla z20.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z11.s\n"
- "fmla z16.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "addvl x24, x24, #1\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
"addvl x13, x13, #1\n"
- "fmla z27.s, p3/M, z6.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z19.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z11.s\n"
- "fmla z19.s, p3/M, z5.s, z11.s\n"
- "fmla z18.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z12.s\n"
- "fmla z20.s, p3/M, z5.s, z12.s\n"
- "fmla z16.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
- "addvl x11, x11, #1\n"
- "fmla z19.s, p3/M, z8.s, z10.s\n"
- "fmla z18.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z21.s, p3/M, z7.s, z11.s\n"
- "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmax z16.s, p3/M, z16.s, z14.s\n"
+ "fmax z17.s, p3/M, z17.s, z14.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z20.s, p3/M, z20.s, z14.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z17.s, p3/M, z4.s, z11.s\n"
- "fmla z16.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n"
- "addvl x15, x15, #1\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z14.s\n"
+ "fmax z19.s, p3/M, z19.s, z14.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z14.s\n"
+ "fmax z22.s, p3/M, z22.s, z14.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z14.s\n"
+ "fmax z24.s, p3/M, z24.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmax z25.s, p3/M, z25.s, z14.s\n"
+ "fmax z28.s, p3/M, z28.s, z14.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmax z26.s, p3/M, z26.s, z14.s\n"
+ "fmax z27.s, p3/M, z27.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z17.s, p3/M, z7.s, z12.s\n"
- "fmla z16.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z1.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
- "whilelt p2.s, x5, %x[n_channels]\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
- "addvl x12, x12, #1\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "cmp x6, %x[n_channels]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
- "fmla z19.s, p3/M, z4.s, z12.s\n"
- "fmla z18.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x14, x9, LSL #2]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
- "fmla z20.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z10.s\n"
- "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmax z29.s, p3/M, z29.s, z14.s\n"
+ "fmax z30.s, p3/M, z30.s, z14.s\n"
+ "fmax z31.s, p3/M, z31.s, z14.s\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z16.s, p3/M, z16.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "fmin z17.s, p3/M, z17.s, z13.s\n"
+ "fmin z18.s, p3/M, z18.s, z13.s\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z19.s, p3/M, z19.s, z13.s\n"
+ "fmin z20.s, p3/M, z20.s, z13.s\n"
+ "fmin z21.s, p3/M, z21.s, z13.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
"ld1w { z10.s }, p1/Z, [x8]\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
- "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
- "addvl x4, x4, #16\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n"
- "fmax z29.s, p3/M, z29.s, z15.s\n"
- "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n"
- "addvl x4, x4, #-6\n"
- "fmin z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z31.s }, p0, [x16]\n"
- "fmin z30.s, p3/M, z30.s, z14.s\n"
- "fmin z29.s, p3/M, z29.s, z14.s\n"
- "st1w { z30.s }, p0, [x16, x17, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z15.s\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
- "st1w { z29.s }, p0, [x16, x26, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z15.s\n"
- "fmax z25.s, p3/M, z25.s, z15.s\n"
- "fmax z24.s, p3/M, z24.s, z15.s\n"
- "fmin z28.s, p3/M, z28.s, z14.s\n"
- "st1w { z28.s }, p0, [x16, x22, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z14.s\n"
- "addvl x16, x16, #1\n"
- "fmin z26.s, p3/M, z26.s, z14.s\n"
- "st1w { z27.s }, p0, [x25]\n"
- "fmin z25.s, p3/M, z25.s, z14.s\n"
- "fmin z24.s, p3/M, z24.s, z14.s\n"
- "st1w { z26.s }, p0, [x25, x17, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z15.s\n"
- "st1w { z25.s }, p0, [x25, x26, LSL #2]\n"
- "fmax z22.s, p3/M, z22.s, z15.s\n"
- "fmax z21.s, p3/M, z21.s, z15.s\n"
- "st1w { z24.s }, p0, [x25, x22, LSL #2]\n"
- "addvl x25, x25, #1\n"
- "fmin z23.s, p3/M, z23.s, z14.s\n"
- "st1w { z23.s }, p0, [x24]\n"
- "fmin z22.s, p3/M, z22.s, z14.s\n"
- "fmin z21.s, p3/M, z21.s, z14.s\n"
- "st1w { z22.s }, p0, [x24, x17, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z15.s\n"
- "fmax z19.s, p3/M, z19.s, z15.s\n"
- "st1w { z21.s }, p0, [x24, x26, LSL #2]\n"
- "fmax z18.s, p3/M, z18.s, z15.s\n"
- "fmax z17.s, p3/M, z17.s, z15.s\n"
- "fmax z16.s, p3/M, z16.s, z15.s\n"
- "fmin z20.s, p3/M, z20.s, z14.s\n"
- "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
- "fmin z19.s, p3/M, z19.s, z14.s\n"
- "addvl x24, x24, #1\n"
- "fmin z18.s, p3/M, z18.s, z14.s\n"
- "st1w { z19.s }, p0, [x23]\n"
- "fmin z17.s, p3/M, z17.s, z14.s\n"
- "fmin z16.s, p3/M, z16.s, z14.s\n"
- "st1w { z18.s }, p0, [x23, x17, LSL #2]\n"
- "st1w { z17.s }, p0, [x23, x26, LSL #2]\n"
- "st1w { z16.s }, p0, [x23, x22, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z13.s\n"
+ "fmin z23.s, p3/M, z23.s, z13.s\n"
+ "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z13.s\n"
+ "fmin z25.s, p3/M, z25.s, z13.s\n"
+ "st1w { z16.s }, p0, [x15]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z26.s, p3/M, z26.s, z13.s\n"
+ "fmin z27.s, p3/M, z27.s, z13.s\n"
+ "st1w { z17.s }, p0, [x15, x6, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z28.s, p3/M, z28.s, z13.s\n"
+ "fmin z29.s, p3/M, z29.s, z13.s\n"
+ "st1w { z18.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z13.s\n"
+ "fmin z31.s, p3/M, z31.s, z13.s\n"
+ "st1w { z19.s }, p0, [x15, x22, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z20.s }, p0, [x9]\n"
+ "addvl x15, x15, #1\n"
+ "st1w { z21.s }, p0, [x9, x6, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "st1w { z22.s }, p0, [x9, x25, LSL #2]\n"
+ "st1w { z23.s }, p0, [x9, x22, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z26.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z27.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z30.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z13\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z13\n fmla z30.s, p3/M, z7.s, z9.s\n"
- "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x2, #0x1\n"
- "movprfx z29, z13\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z27, z13\n fmla z27.s, p3/M, z5.s, z9.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x3, x3, #0x1\n"
- "movprfx z26, z13\n fmla z26.s, p3/M, z4.s, z9.s\n"
- "cmp x3, x19\n"
- "movprfx z25, z13\n fmla z25.s, p3/M, z3.s, z9.s\n"
- "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z9.s\n"
- "csel x3, x3, XZR, LT\n"
- "movprfx z22, z13\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "csel x2, x2, x21, LT\n"
- "movprfx z21, z13\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
- "cmp x2, x20\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11]\n"
- "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "fmla z26.s, p3/M, z5.s, z12.s\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "movprfx z24, z13\n fmla z24.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z2.s, z12.s\n"
- "fmla z21.s, p3/M, z1.s, z12.s\n"
- "movprfx z20, z13\n fmla z20.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
- "movprfx z19, z13\n fmla z19.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
- "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z9.s\n"
- "fmla z26.s, p3/M, z7.s, z9.s\n"
- "fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z9.s\n"
- "fmla z22.s, p3/M, z4.s, z9.s\n"
- "fmla z21.s, p3/M, z3.s, z9.s\n"
- "fmla z19.s, p3/M, z2.s, z9.s\n"
- "movprfx z18, z13\n fmla z18.s, p3/M, z1.s, z9.s\n"
- "movprfx z17, z13\n fmla z17.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x15]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z10.s\n"
- "fmla z22.s, p3/M, z5.s, z10.s\n"
- "fmla z21.s, p3/M, z4.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "mov p0.b, p2.b\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "fmla z27.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z11.s\n"
- "fmla z19.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z2.s, z10.s\n"
- "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x24]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
+ "fmla z16.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z23.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z16.s, p3/M, z5.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z21.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
"fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z11.s\n"
- "fmla z16.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
"fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
- "fmla z19.s, p3/M, z7.s, z11.s\n"
- "fmla z18.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z16.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
"fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z27.s, p3/M, z4.s, z10.s\n"
- "fmla z26.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z17.s, p3/M, z8.s, z11.s\n"
- "fmla z16.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z20.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14]\n"
- "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z11.s\n"
"fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z4.s, z11.s\n"
- "fmla z22.s, p3/M, z3.s, z11.s\n"
- "fmla z19.s, p3/M, z1.s, z11.s\n"
- "fmla z18.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13]\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x5, LSL #2]\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z24.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z5.s, z11.s\n"
- "fmla z20.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z11.s\n"
- "fmla z16.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z12.s\n"
- "fmla z24.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z19.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z11.s\n"
- "fmla z19.s, p3/M, z5.s, z11.s\n"
- "fmla z18.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z12.s\n"
- "fmla z20.s, p3/M, z5.s, z12.s\n"
- "fmla z16.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
- "fmla z19.s, p3/M, z8.s, z10.s\n"
- "fmla z18.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z21.s, p3/M, z7.s, z11.s\n"
- "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmax z16.s, p3/M, z16.s, z14.s\n"
+ "fmax z17.s, p3/M, z17.s, z14.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z20.s, p3/M, z20.s, z14.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z17.s, p3/M, z4.s, z11.s\n"
- "fmla z16.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z14.s\n"
+ "fmax z19.s, p3/M, z19.s, z14.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z14.s\n"
+ "fmax z22.s, p3/M, z22.s, z14.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z14.s\n"
+ "fmax z24.s, p3/M, z24.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmax z25.s, p3/M, z25.s, z14.s\n"
+ "fmax z28.s, p3/M, z28.s, z14.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmax z26.s, p3/M, z26.s, z14.s\n"
+ "fmax z27.s, p3/M, z27.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z17.s, p3/M, z7.s, z12.s\n"
- "fmla z16.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z1.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "fmla z19.s, p3/M, z4.s, z12.s\n"
- "fmla z18.s, p3/M, z3.s, z12.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z10.s\n"
- "fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "fmax z29.s, p3/M, z29.s, z15.s\n"
- "fmax z28.s, p3/M, z28.s, z15.s\n"
- "fmin z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z31.s }, p0, [x16]\n"
- "fmin z30.s, p3/M, z30.s, z14.s\n"
- "fmin z29.s, p3/M, z29.s, z14.s\n"
- "st1w { z30.s }, p0, [x16, x17, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z14.s\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
- "st1w { z29.s }, p0, [x16, x26, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z15.s\n"
- "st1w { z28.s }, p0, [x16, x22, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z14.s\n"
- "fmax z25.s, p3/M, z25.s, z15.s\n"
- "st1w { z27.s }, p0, [x25]\n"
- "fmin z26.s, p3/M, z26.s, z14.s\n"
- "fmin z25.s, p3/M, z25.s, z14.s\n"
- "st1w { z26.s }, p0, [x25, x17, LSL #2]\n"
- "fmax z24.s, p3/M, z24.s, z15.s\n"
- "fmax z23.s, p3/M, z23.s, z15.s\n"
- "st1w { z25.s }, p0, [x25, x26, LSL #2]\n"
- "fmax z22.s, p3/M, z22.s, z15.s\n"
- "fmax z21.s, p3/M, z21.s, z15.s\n"
- "fmax z20.s, p3/M, z20.s, z15.s\n"
- "fmin z24.s, p3/M, z24.s, z14.s\n"
- "st1w { z24.s }, p0, [x25, x22, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z14.s\n"
- "fmin z22.s, p3/M, z22.s, z14.s\n"
- "st1w { z23.s }, p0, [x24]\n"
- "fmin z21.s, p3/M, z21.s, z14.s\n"
- "fmin z20.s, p3/M, z20.s, z14.s\n"
- "st1w { z22.s }, p0, [x24, x17, LSL #2]\n"
- "fmax z19.s, p3/M, z19.s, z15.s\n"
- "st1w { z21.s }, p0, [x24, x26, LSL #2]\n"
- "fmax z18.s, p3/M, z18.s, z15.s\n"
- "fmax z17.s, p3/M, z17.s, z15.s\n"
- "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
- "fmin z19.s, p3/M, z19.s, z14.s\n"
- "st1w { z19.s }, p0, [x23]\n"
- "fmin z18.s, p3/M, z18.s, z14.s\n"
- "fmin z17.s, p3/M, z17.s, z14.s\n"
- "st1w { z18.s }, p0, [x23, x17, LSL #2]\n"
- "fmax z16.s, p3/M, z16.s, z15.s\n"
- "st1w { z17.s }, p0, [x23, x26, LSL #2]\n"
- "fmin z16.s, p3/M, z16.s, z14.s\n"
- "st1w { z16.s }, p0, [x23, x22, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z14.s\n"
+ "fmax z30.s, p3/M, z30.s, z14.s\n"
+ "fmax z31.s, p3/M, z31.s, z14.s\n"
+ "fmin z16.s, p3/M, z16.s, z13.s\n"
+ "st1w { z16.s }, p0, [x15]\n"
+ "fmin z17.s, p3/M, z17.s, z13.s\n"
+ "fmin z18.s, p3/M, z18.s, z13.s\n"
+ "st1w { z17.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z19.s, p3/M, z19.s, z13.s\n"
+ "fmin z20.s, p3/M, z20.s, z13.s\n"
+ "st1w { z18.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z13.s\n"
+ "fmin z22.s, p3/M, z22.s, z13.s\n"
+ "st1w { z19.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z13.s\n"
+ "fmin z24.s, p3/M, z24.s, z13.s\n"
+ "st1w { z20.s }, p0, [x9]\n"
+ "fmin z25.s, p3/M, z25.s, z13.s\n"
+ "fmin z26.s, p3/M, z26.s, z13.s\n"
+ "st1w { z21.s }, p0, [x9, x6, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z13.s\n"
+ "fmin z28.s, p3/M, z28.s, z13.s\n"
+ "st1w { z22.s }, p0, [x9, x25, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z13.s\n"
+ "fmin z30.s, p3/M, z30.s, z13.s\n"
+ "st1w { z23.s }, p0, [x9, x22, LSL #2]\n"
+ "fmin z31.s, p3/M, z31.s, z13.s\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "st1w { z25.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z26.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z27.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z30.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x22, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 158d44046c..0b04ae064d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,613 +98,613 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cntw x12\n"
- "ld1w { z13.s }, p3/Z, [x15]\n"
- "sub x11, XZR, x12\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "cntw x15\n"
+ "mov x14, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x10, x9, [x14, #0x0]\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1w { z9.s }, p2/Z, [x10, x13, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "ld1w { z11.s }, p2/Z, [x28, x13, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "sub x13, XZR, x15\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z13\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x26, [x14, #0x20]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z13\n fmla z30.s, p3/M, z7.s, z9.s\n"
- "ldr x25, [x14, #0x28]\n"
- "incw x11\n"
- "movprfx z29, z13\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x24, [x14, #0x30]\n"
- "mov p0.b, p2.b\n"
- "movprfx z27, z13\n fmla z27.s, p3/M, z5.s, z9.s\n"
- "ldr x23, [x14, #0x38]\n"
- "movprfx z26, z13\n fmla z26.s, p3/M, z4.s, z9.s\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z25, z13\n fmla z25.s, p3/M, z3.s, z9.s\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z9.s\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z22, z13\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z21, z13\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x13, LSL #2]\n"
- "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "ldr x26, [x14, #0x60]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z26.s, p3/M, z5.s, z12.s\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "ldr x21, [x16, #0x8]\n"
- "movprfx z24, z13\n fmla z24.s, p3/M, z3.s, z12.s\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z22.s, p3/M, z2.s, z12.s\n"
- "ldr x19, [x16, #0x18]\n"
- "fmla z21.s, p3/M, z1.s, z12.s\n"
- "movprfx z20, z13\n fmla z20.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "movprfx z19, z13\n fmla z19.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z9.s\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z26.s, p3/M, z7.s, z9.s\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z25.s, p3/M, z6.s, z9.s\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z23.s, p3/M, z5.s, z9.s\n"
- "fmla z22.s, p3/M, z4.s, z9.s\n"
- "fmla z21.s, p3/M, z3.s, z9.s\n"
- "fmla z19.s, p3/M, z2.s, z9.s\n"
- "movprfx z18, z13\n fmla z18.s, p3/M, z1.s, z9.s\n"
- "movprfx z17, z13\n fmla z17.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x27, [x14, #0x98]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "ld1w { z13.s }, p3/Z, [x15]\n"
- "fmla z24.s, p3/M, z6.s, z10.s\n"
- "fmla z22.s, p3/M, z5.s, z10.s\n"
- "fmla z21.s, p3/M, z4.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x26, [x16, #0x30]\n"
+ "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "ldr x24, [x16, #0x38]\n"
+ "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z27.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z11.s\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z19.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z2.s, z10.s\n"
- "fmla z26.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z11.s\n"
- "ldr x9, [x14, #0xc8]\n"
- "fmla z16.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z19.s, p3/M, z7.s, z11.s\n"
- "ldr x27, [x14, #0xd8]\n"
- "fmla z18.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z10.s\n"
- "ldr x28, [x14, #0xd0]\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z27.s, p3/M, z4.s, z10.s\n"
- "fmla z26.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z17.s, p3/M, z8.s, z11.s\n"
- "ldr x26, [x14, #0xe0]\n"
- "fmla z16.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z20.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ldr x24, [x14, #0xf0]\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z7.s, z11.s\n"
- "ldr x23, [x14, #0xf8]\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z4.s, z11.s\n"
- "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ldr x27, [x16, #0x60]\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "incw x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ldr x23, [x28, #0x0]\n"
+ "ldr x22, [x28, #0x8]\n"
+ "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0x78]\n"
+ "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "whilelt p0.s, x15, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z0.s, z12.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
"fmla z19.s, p3/M, z1.s, z11.s\n"
- "fmla z18.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "ldr x10, [x14, #0x100]\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z10.s\n"
- "ldr x9, [x14, #0x108]\n"
+ "ld1w { z11.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ldr x27, [x16, #0xa0]\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z23.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
"fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "ldr x28, [x14, #0x110]\n"
- "fmla z24.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z5.s, z11.s\n"
- "fmla z20.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z11.s\n"
- "fmla z16.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z12.s\n"
- "ldr x27, [x14, #0x118]\n"
- "fmla z24.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z19.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z11.s\n"
- "fmla z19.s, p3/M, z5.s, z11.s\n"
- "fmla z18.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z12.s\n"
- "fmla z20.s, p3/M, z5.s, z12.s\n"
- "fmla z16.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z19.s, p3/M, z8.s, z10.s\n"
- "fmla z18.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldr x11, [x16, #0xc8]\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ldr x9, [x16, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x10, [x16, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z21.s, p3/M, z7.s, z11.s\n"
- "fmla z20.s, p3/M, z6.s, z11.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z17.s, p3/M, z4.s, z11.s\n"
- "fmla z16.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "ldp x10, x9, [x14, #0x0]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z21.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ldr x27, [x16, #0xe0]\n"
"fmla z18.s, p3/M, z8.s, z12.s\n"
- "ld1w { z9.s }, p1/Z, [x10, x12, LSL #2]\n"
- "fmla z17.s, p3/M, z7.s, z12.s\n"
- "fmla z16.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z1.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x13, LSL #2]\n"
- "incw x13\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "ldr x12, [x16, #0x100]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldr x11, [x16, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x10, [x16, #0x110]\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z12.s\n"
+ "ldr x9, [x16, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
"fmla z29.s, p3/M, z5.s, z11.s\n"
- "ldp x28, x27, [x14, #0x10]\n"
- "whilelt p2.s, x13, %x[n_channels]\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x28, x12, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "fmla z19.s, p3/M, z4.s, z12.s\n"
- "fmla z18.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x27, x12, LSL #2]\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z20.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z10.s\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldp x12, x11, [x16, #0x0]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
- "ld1w { z10.s }, p1/Z, [x9, x12, LSL #2]\n"
- "incw x12\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmax z29.s, p3/M, z29.s, z15.s\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmax z28.s, p3/M, z28.s, z15.s\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
- "fmin z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z14.s\n"
- "fmin z29.s, p3/M, z29.s, z14.s\n"
- "ldr x22, [x16, #0x20]\n"
- "fmin z28.s, p3/M, z28.s, z14.s\n"
- "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z14.s\n"
- "fmax z26.s, p3/M, z26.s, z15.s\n"
- "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
- "fmax z25.s, p3/M, z25.s, z15.s\n"
- "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z24.s, p3/M, z24.s, z15.s\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z23.s, p3/M, z23.s, z15.s\n"
- "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z26.s, p3/M, z26.s, z14.s\n"
- "ldr x20, [x16, #0x30]\n"
- "fmin z25.s, p3/M, z25.s, z14.s\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z24.s, p3/M, z24.s, z14.s\n"
- "ldr x22, [x16, #0x40]\n"
- "fmin z23.s, p3/M, z23.s, z14.s\n"
- "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
- "fmax z22.s, p3/M, z22.s, z15.s\n"
- "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
- "fmax z21.s, p3/M, z21.s, z15.s\n"
- "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z15.s\n"
- "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
- "fmax z19.s, p3/M, z19.s, z15.s\n"
- "ldr x21, [x16, #0x48]\n"
- "fmin z22.s, p3/M, z22.s, z14.s\n"
- "ldr x20, [x16, #0x50]\n"
- "fmin z21.s, p3/M, z21.s, z14.s\n"
- "ldr x19, [x16, #0x58]\n"
- "fmin z20.s, p3/M, z20.s, z14.s\n"
- "ldr x22, [x16, #0x60]\n"
- "fmin z19.s, p3/M, z19.s, z14.s\n"
- "st1w { z22.s }, p0, [x21, x11, LSL #2]\n"
- "fmax z18.s, p3/M, z18.s, z15.s\n"
- "st1w { z21.s }, p0, [x20, x11, LSL #2]\n"
- "fmax z17.s, p3/M, z17.s, z15.s\n"
- "st1w { z20.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z16.s, p3/M, z16.s, z15.s\n"
- "st1w { z19.s }, p0, [x22, x11, LSL #2]\n"
- "ldr x21, [x16, #0x68]\n"
- "fmin z18.s, p3/M, z18.s, z14.s\n"
- "ldr x20, [x16, #0x70]\n"
- "fmin z17.s, p3/M, z17.s, z14.s\n"
- "ldr x19, [x16, #0x78]\n"
- "fmin z16.s, p3/M, z16.s, z14.s\n"
- "st1w { z18.s }, p0, [x21, x11, LSL #2]\n"
- "st1w { z17.s }, p0, [x20, x11, LSL #2]\n"
- "st1w { z16.s }, p0, [x19, x11, LSL #2]\n"
+ "fmax z16.s, p3/M, z16.s, z14.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z14.s\n"
+ "fmax z18.s, p3/M, z18.s, z14.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmax z19.s, p3/M, z19.s, z14.s\n"
+ "fmin z16.s, p3/M, z16.s, z13.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "fmin z17.s, p3/M, z17.s, z13.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmin z18.s, p3/M, z18.s, z13.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmin z19.s, p3/M, z19.s, z13.s\n"
+ "fmax z20.s, p3/M, z20.s, z14.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z14.s\n"
+ "fmax z22.s, p3/M, z22.s, z14.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmax z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z16.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x23, [x28, #0x20]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "st1w { z18.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "st1w { z19.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ldp x10, x9, [x16, #0x10]\n"
+ "fmin z20.s, p3/M, z20.s, z13.s\n"
+ "fmin z21.s, p3/M, z21.s, z13.s\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "fmin z22.s, p3/M, z22.s, z13.s\n"
+ "fmin z23.s, p3/M, z23.s, z13.s\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x28, #0x48]\n"
+ "fmax z24.s, p3/M, z24.s, z14.s\n"
+ "fmax z25.s, p3/M, z25.s, z14.s\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x28, #0x50]\n"
+ "fmax z26.s, p3/M, z26.s, z14.s\n"
+ "fmax z27.s, p3/M, z27.s, z14.s\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x28, #0x58]\n"
+ "incw x14\n"
+ "ld1w { z9.s }, p0/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x11, x15, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z13.s\n"
+ "ld1w { z11.s }, p0/Z, [x10, x15, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x9, x15, LSL #2]\n"
+ "incw x15\n"
+ "fmin z25.s, p3/M, z25.s, z13.s\n"
+ "fmin z26.s, p3/M, z26.s, z13.s\n"
+ "fmin z27.s, p3/M, z27.s, z13.s\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x28, #0x60]\n"
+ "fmax z28.s, p3/M, z28.s, z14.s\n"
+ "fmax z29.s, p3/M, z29.s, z14.s\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x28, #0x68]\n"
+ "fmax z30.s, p3/M, z30.s, z14.s\n"
+ "fmax z31.s, p3/M, z31.s, z14.s\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x28, #0x70]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x28, #0x78]\n"
+ "ld1w { z15.s }, p3/Z, [x17]\n"
+ "whilelt p2.s, x14, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "fmin z28.s, p3/M, z28.s, z13.s\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "fmin z29.s, p3/M, z29.s, z13.s\n"
+ "fmin z30.s, p3/M, z30.s, z13.s\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z31.s, p3/M, z31.s, z13.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z13\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x26, [x14, #0x20]\n"
- "incw x11\n"
- "movprfx z30, z13\n fmla z30.s, p3/M, z7.s, z9.s\n"
- "ldr x25, [x14, #0x28]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z13\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x24, [x14, #0x30]\n"
- "movprfx z27, z13\n fmla z27.s, p3/M, z5.s, z9.s\n"
- "ldr x23, [x14, #0x38]\n"
- "movprfx z26, z13\n fmla z26.s, p3/M, z4.s, z9.s\n"
- "ldr x10, [x14, #0x40]\n"
- "movprfx z25, z13\n fmla z25.s, p3/M, z3.s, z9.s\n"
- "ldr x9, [x14, #0x48]\n"
- "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z9.s\n"
- "ldr x28, [x14, #0x50]\n"
- "movprfx z22, z13\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "ldr x27, [x14, #0x58]\n"
- "movprfx z21, z13\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x13, LSL #2]\n"
- "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z12.s\n"
- "ldr x26, [x14, #0x60]\n"
- "fmla z29.s, p3/M, z7.s, z12.s\n"
- "ldr x25, [x14, #0x68]\n"
- "fmla z26.s, p3/M, z5.s, z12.s\n"
- "ldr x24, [x14, #0x70]\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ldr x22, [x16, #0x0]\n"
- "fmla z25.s, p3/M, z4.s, z12.s\n"
- "ldr x21, [x16, #0x8]\n"
- "movprfx z24, z13\n fmla z24.s, p3/M, z3.s, z12.s\n"
- "ldr x20, [x16, #0x10]\n"
- "fmla z22.s, p3/M, z2.s, z12.s\n"
- "ldr x19, [x16, #0x18]\n"
- "fmla z21.s, p3/M, z1.s, z12.s\n"
- "movprfx z20, z13\n fmla z20.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "movprfx z19, z13\n fmla z19.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z8.s, z9.s\n"
- "ldr x23, [x14, #0x78]\n"
- "fmla z26.s, p3/M, z7.s, z9.s\n"
- "ldr x10, [x14, #0x80]\n"
- "fmla z25.s, p3/M, z6.s, z9.s\n"
- "ldr x9, [x14, #0x88]\n"
- "fmla z23.s, p3/M, z5.s, z9.s\n"
- "fmla z22.s, p3/M, z4.s, z9.s\n"
- "fmla z21.s, p3/M, z3.s, z9.s\n"
- "fmla z19.s, p3/M, z2.s, z9.s\n"
- "movprfx z18, z13\n fmla z18.s, p3/M, z1.s, z9.s\n"
- "movprfx z17, z13\n fmla z17.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x28, [x14, #0x90]\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x27, [x14, #0x98]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z8.s, z10.s\n"
- "ldr x26, [x14, #0xa0]\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z24.s, p3/M, z6.s, z10.s\n"
- "fmla z22.s, p3/M, z5.s, z10.s\n"
- "fmla z21.s, p3/M, z4.s, z10.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "movprfx z21, z15\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z15\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x26, [x16, #0x30]\n"
+ "movprfx z22, z15\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z15\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "ldr x24, [x16, #0x38]\n"
+ "movprfx z26, z15\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "movprfx z17, z15\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "ldr x12, [x16, #0x40]\n"
+ "ldr x11, [x16, #0x48]\n"
+ "movprfx z18, z15\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "ldr x10, [x16, #0x50]\n"
+ "ldr x9, [x16, #0x58]\n"
+ "movprfx z20, z15\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "ldr x25, [x14, #0xa8]\n"
- "fmla z27.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z11.s\n"
- "ldr x23, [x14, #0xb8]\n"
- "fmla z19.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "ldr x24, [x14, #0xb0]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z2.s, z10.s\n"
- "fmla z26.s, p3/M, z1.s, z10.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z11.s\n"
- "ldr x9, [x14, #0xc8]\n"
- "fmla z16.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ldr x10, [x14, #0xc0]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z12.s\n"
- "fmla z24.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z19.s, p3/M, z7.s, z11.s\n"
- "ldr x27, [x14, #0xd8]\n"
- "fmla z18.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z10.s\n"
- "ldr x28, [x14, #0xd0]\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmla z27.s, p3/M, z4.s, z10.s\n"
- "fmla z26.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z22.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z17.s, p3/M, z8.s, z11.s\n"
- "ldr x26, [x14, #0xe0]\n"
- "fmla z16.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "ldr x25, [x14, #0xe8]\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z5.s, z12.s\n"
- "fmla z24.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z2.s, z12.s\n"
- "fmla z20.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ldr x24, [x14, #0xf0]\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z7.s, z11.s\n"
- "ldr x23, [x14, #0xf8]\n"
- "fmla z26.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z4.s, z11.s\n"
- "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "movprfx z19, z15\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ldr x27, [x16, #0x60]\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "incw x13\n"
+ "mov p1.b, p2.b\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ldr x23, [x28, #0x0]\n"
+ "ldr x22, [x28, #0x8]\n"
+ "movprfx z23, z15\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z15\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0x78]\n"
+ "movprfx z31, z15\n fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0x80]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z0.s, z12.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
"fmla z19.s, p3/M, z1.s, z11.s\n"
- "fmla z18.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "ldr x10, [x14, #0x100]\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z10.s\n"
- "ldr x9, [x14, #0x108]\n"
+ "ld1w { z11.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ldr x27, [x16, #0xa0]\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z23.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
"fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "ldr x28, [x14, #0x110]\n"
- "fmla z24.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z5.s, z11.s\n"
- "fmla z20.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z2.s, z11.s\n"
- "fmla z16.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z12.s\n"
- "ldr x27, [x14, #0x118]\n"
- "fmla z24.s, p3/M, z5.s, z12.s\n"
- "fmla z20.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z10.s\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "fmla z19.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z8.s, z11.s\n"
- "fmla z19.s, p3/M, z5.s, z11.s\n"
- "fmla z18.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z24.s, p3/M, z8.s, z12.s\n"
- "fmla z20.s, p3/M, z5.s, z12.s\n"
- "fmla z16.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z19.s, p3/M, z8.s, z10.s\n"
- "fmla z18.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ldr x9, [x16, #0x98]\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "ldr x12, [x16, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldr x11, [x16, #0xc8]\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ldr x9, [x16, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x10, [x16, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x10, x13, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z11.s\n"
- "fmla z21.s, p3/M, z7.s, z11.s\n"
- "fmla z20.s, p3/M, z6.s, z11.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "fmla z17.s, p3/M, z4.s, z11.s\n"
- "fmla z16.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z21.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ldr x27, [x16, #0xe0]\n"
"fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z17.s, p3/M, z7.s, z12.s\n"
- "fmla z16.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z10.s\n"
- "fmla z27.s, p3/M, z1.s, z10.s\n"
- "fmla z26.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "ldr x12, [x16, #0x100]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "ldr x11, [x16, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x10, [x16, #0x110]\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z12.s\n"
+ "ldr x9, [x16, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
"fmla z29.s, p3/M, z5.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z2.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "fmla z19.s, p3/M, z4.s, z12.s\n"
- "fmla z18.s, p3/M, z3.s, z12.s\n"
- "fmla z21.s, p3/M, z8.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z10.s\n"
- "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "fmax z29.s, p3/M, z29.s, z15.s\n"
- "fmax z28.s, p3/M, z28.s, z15.s\n"
- "fmin z31.s, p3/M, z31.s, z14.s\n"
- "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z14.s\n"
- "fmin z29.s, p3/M, z29.s, z14.s\n"
- "ldr x22, [x16, #0x20]\n"
- "fmin z28.s, p3/M, z28.s, z14.s\n"
- "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
- "fmax z26.s, p3/M, z26.s, z15.s\n"
- "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
- "fmax z25.s, p3/M, z25.s, z15.s\n"
- "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z24.s, p3/M, z24.s, z15.s\n"
- "ldr x21, [x16, #0x28]\n"
- "fmax z23.s, p3/M, z23.s, z15.s\n"
- "ldr x20, [x16, #0x30]\n"
- "fmin z27.s, p3/M, z27.s, z14.s\n"
- "ldr x19, [x16, #0x38]\n"
- "fmin z26.s, p3/M, z26.s, z14.s\n"
- "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z14.s\n"
- "fmin z24.s, p3/M, z24.s, z14.s\n"
- "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z14.s\n"
- "ldr x22, [x16, #0x40]\n"
- "fmax z22.s, p3/M, z22.s, z15.s\n"
- "ldr x21, [x16, #0x48]\n"
- "fmax z21.s, p3/M, z21.s, z15.s\n"
- "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
- "fmax z20.s, p3/M, z20.s, z15.s\n"
- "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z19.s, p3/M, z19.s, z15.s\n"
- "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
- "fmin z22.s, p3/M, z22.s, z14.s\n"
- "ldr x20, [x16, #0x50]\n"
- "fmin z21.s, p3/M, z21.s, z14.s\n"
- "ldr x19, [x16, #0x58]\n"
- "fmin z20.s, p3/M, z20.s, z14.s\n"
- "ldr x22, [x16, #0x60]\n"
- "fmin z19.s, p3/M, z19.s, z14.s\n"
- "st1w { z22.s }, p0, [x21, x11, LSL #2]\n"
- "fmax z18.s, p3/M, z18.s, z15.s\n"
- "st1w { z21.s }, p0, [x20, x11, LSL #2]\n"
- "fmax z17.s, p3/M, z17.s, z15.s\n"
- "st1w { z20.s }, p0, [x19, x11, LSL #2]\n"
- "fmax z16.s, p3/M, z16.s, z15.s\n"
- "st1w { z19.s }, p0, [x22, x11, LSL #2]\n"
- "ldr x21, [x16, #0x68]\n"
- "fmin z18.s, p3/M, z18.s, z14.s\n"
- "ldr x20, [x16, #0x70]\n"
- "fmin z17.s, p3/M, z17.s, z14.s\n"
- "ldr x19, [x16, #0x78]\n"
- "fmin z16.s, p3/M, z16.s, z14.s\n"
- "st1w { z18.s }, p0, [x21, x11, LSL #2]\n"
- "st1w { z17.s }, p0, [x20, x11, LSL #2]\n"
- "st1w { z16.s }, p0, [x19, x11, LSL #2]\n"
+ "fmax z16.s, p3/M, z16.s, z14.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z14.s\n"
+ "fmax z18.s, p3/M, z18.s, z14.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmax z19.s, p3/M, z19.s, z14.s\n"
+ "fmin z16.s, p3/M, z16.s, z13.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "fmin z17.s, p3/M, z17.s, z13.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "fmin z18.s, p3/M, z18.s, z13.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmin z19.s, p3/M, z19.s, z13.s\n"
+ "fmax z20.s, p3/M, z20.s, z14.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z14.s\n"
+ "fmax z22.s, p3/M, z22.s, z14.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmax z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z16.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x23, [x28, #0x20]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "st1w { z18.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "st1w { z19.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmin z20.s, p3/M, z20.s, z13.s\n"
+ "fmin z21.s, p3/M, z21.s, z13.s\n"
+ "fmin z22.s, p3/M, z22.s, z13.s\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "fmin z23.s, p3/M, z23.s, z13.s\n"
+ "fmax z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x28, #0x48]\n"
+ "fmax z25.s, p3/M, z25.s, z14.s\n"
+ "fmax z26.s, p3/M, z26.s, z14.s\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x28, #0x50]\n"
+ "fmax z27.s, p3/M, z27.s, z14.s\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x28, #0x58]\n"
+ "fmin z24.s, p3/M, z24.s, z13.s\n"
+ "fmin z25.s, p3/M, z25.s, z13.s\n"
+ "fmin z26.s, p3/M, z26.s, z13.s\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x28, #0x60]\n"
+ "fmin z27.s, p3/M, z27.s, z13.s\n"
+ "fmax z28.s, p3/M, z28.s, z14.s\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x28, #0x68]\n"
+ "fmax z29.s, p3/M, z29.s, z14.s\n"
+ "fmax z30.s, p3/M, z30.s, z14.s\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x28, #0x70]\n"
+ "fmax z31.s, p3/M, z31.s, z14.s\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x28, #0x78]\n"
+ "fmin z28.s, p3/M, z28.s, z13.s\n"
+ "fmin z29.s, p3/M, z29.s, z13.s\n"
+ "fmin z30.s, p3/M, z30.s, z13.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "fmin z31.s, p3/M, z31.s, z13.s\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index ac33dcbce5..5a1f309b88 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,246 +88,246 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x7, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x11, #0x0\n"
+ "mov x16, #0x0\n"
"1:" // Tile loop
- "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x23, #0x4\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x17, #0x2\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x15, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cntw x14\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x12, XZR, x14\n"
- "ldr x21, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
- "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x8, x13, x19\n" // offset += tile_j * ld_input_col
- "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
- "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x21, x21, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x9, x21, x22, LSL #2\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x28, x9, x22, LSL #2\n"
- "ld1w { z17.s }, p3/Z, [x16]\n"
- "add x27, x28, x22, LSL #2\n"
- "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
- "add x26, x27, x22, LSL #2\n"
- "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "add x25, x13, x13\n"
- "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
- "add x24, x25, x13\n"
- "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "add x23, x24, x13\n"
- "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
- "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
- "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "madd x19, x8, x11, x19\n" // offset += tile_j * ld_output_col
- "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
- "mul x19, x19, x17\n" // offset *= output_tile_size
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x13\n"
+ "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x15, x15\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x12, x12, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x12, x23, LSL #2\n"
+ "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z9.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x21]\n"
- "add x10, x10, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "add x22, x10, x20, LSL #2\n"
- "ld1w { z12.s }, p2/Z, [x21, x24, LSL #2]\n"
- "addvl x16, x16, #16\n"
- "ld1w { z13.s }, p2/Z, [x21, x23, LSL #2]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "ld1w { z14.s }, p2/Z, [x9]\n"
- "ld1w { z15.s }, p2/Z, [x9, x13, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x21, x25, LSL #2]\n"
+ "ld1w { z19.s }, p3/Z, [x11]\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "add x27, x28, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "add x26, x10, x15\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x24, x26, x15\n"
+ "add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "cmp x13, %x[n_channels]\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "add x22, x9, x21, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x13\n"
+ "ld1w { z9.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x26, LSL #2]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1w { z13.s }, p2/Z, [x12, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x28]\n"
+ "ld1w { z15.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z17\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x14, %x[n_channels]\n"
- "movprfx z30, z17\n fmla z30.s, p3/M, z6.s, z9.s\n"
- "incw x12\n"
- "movprfx z29, z17\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z17\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ld1w { z17.s }, p3/Z, [x16]\n"
- "incw x15\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "addvl x21, x21, #1\n"
- "ld1w { z10.s }, p1/Z, [x21]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "incw x14\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x9, x23, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x27]\n"
- "addvl x9, x9, #1\n"
- "fmla z30.s, p3/M, z0.s, z16.s\n"
- "fmla z29.s, p3/M, z3.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x23, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x28]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z15.s\n"
- "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "incw x21\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "incw x13\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z14.s }, p2/Z, [x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x23, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x27]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "addvl x12, x12, #1\n"
"addvl x28, x28, #1\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "ld1w { z9.s }, p1/Z, [x28, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x26]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
- "fmla z29.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x26, x25, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x27, x25, LSL #2]\n"
- "addvl x27, x27, #1\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x21, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z13.s }, p1/Z, [x21, x23, LSL #2]\n"
- "fmax z31.s, p3/M, z31.s, z19.s\n"
- "fmla z28.s, p3/M, z5.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x26, x24, LSL #2]\n"
- "fmax z30.s, p3/M, z30.s, z19.s\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "fmin z31.s, p3/M, z31.s, z18.s\n"
- "st1w { z31.s }, p0, [x10]\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x23, LSL #2]\n"
- "whilelt p2.s, x15, %x[n_channels]\n"
- "fmin z30.s, p3/M, z30.s, z18.s\n"
- "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
- "addvl x26, x26, #1\n"
- "fmla z28.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21, x25, LSL #2]\n"
- "cmp x14, %x[n_channels]\n"
- "fmax z29.s, p3/M, z29.s, z19.s\n"
- "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "st1w { z30.s }, p0, [x10, x11, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [x9]\n"
- "addvl x10, x10, #1\n"
- "fmin z29.s, p3/M, z29.s, z18.s\n"
- "st1w { z29.s }, p0, [x22]\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z19.s }, p3/Z, [x11]\n"
+ "fmla z30.s, p3/M, z3.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z14.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z15.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "incw x20\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x23, x26, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p1/Z, [x9, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x21, x13, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
- "fmax z28.s, p3/M, z28.s, z19.s\n"
- "addvl x16, x16, #16\n"
- "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "fmin z28.s, p3/M, z28.s, z18.s\n"
- "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "st1w { z28.s }, p0, [x22, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmla z30.s, p3/M, z6.s, z15.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmla z30.s, p3/M, z8.s, z15.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "cmp x13, %x[n_channels]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "ld1w { z10.s }, p1/Z, [x12]\n"
+ "ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "addvl x25, x25, #1\n"
+ "ld1w { z14.s }, p1/Z, [x28]\n"
+ "ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
+ "st1w { z28.s }, p0, [x9]\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x14, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z17\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z17\n fmla z30.s, p3/M, z6.s, z9.s\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x21, x7, #0x1\n"
- "movprfx z29, z17\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "movprfx z28, z17\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x8, x8, #0x1\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "cmp x8, x19\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x23, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
- "csel x8, x8, XZR, LT\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x9, x25, LSL #2]\n"
- "csel x7, x7, x21, LT\n"
- "fmla z31.s, p3/M, z3.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x27]\n"
- "cmp x7, x20\n"
- "fmla z30.s, p3/M, z0.s, z16.s\n"
- "fmla z29.s, p3/M, z3.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x27, x23, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x28]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z15.s\n"
- "fmla z31.s, p3/M, z2.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x28, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
"ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z14.s }, p2/Z, [x25]\n"
+ "add x16, x16, #0x1\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x23, LSL #2]\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x26]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "fmla z29.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x26, x25, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x27, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "fmax z31.s, p3/M, z31.s, z19.s\n"
- "fmla z28.s, p3/M, z5.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x26, x24, LSL #2]\n"
- "fmax z30.s, p3/M, z30.s, z19.s\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "fmin z31.s, p3/M, z31.s, z18.s\n"
- "st1w { z31.s }, p0, [x10]\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "fmla z29.s, p3/M, z8.s, z15.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x23, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z18.s\n"
- "st1w { z30.s }, p0, [x10, x11, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z16.s\n"
- "fmax z29.s, p3/M, z29.s, z19.s\n"
- "fmla z28.s, p3/M, z7.s, z14.s\n"
- "fmin z29.s, p3/M, z29.s, z18.s\n"
- "st1w { z29.s }, p0, [x22]\n"
+ "ld1w { z15.s }, p2/Z, [x27]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "cmp x16, x20\n"
+ "add x21, x11, #0x1\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z30.s, p3/M, z3.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z14.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z15.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "csel x11, x11, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x23, x26, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z15.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmax z28.s, p3/M, z28.s, z19.s\n"
- "fmin z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z28.s }, p0, [x22, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x23]\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmla z30.s, p3/M, z6.s, z15.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "cmp x11, x20\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z28.s }, p0, [x9]\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmla z30.s, p3/M, z8.s, z15.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x14, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 829b0ff2c7..eb6c2daa97 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,247 +87,247 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mov x13, #0x0\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cntw x12\n"
- "ldp x11, x10, [x19, #0x0]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x19, #0x10]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z17.s }, p3/Z, [x15]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x22, x13, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n"
- "ld1w { z15.s }, p2/Z, [x20, x13, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "ld1w { z19.s }, p3/Z, [x16]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z17\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x26, [x14, #0x40]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z17\n fmla z30.s, p3/M, z6.s, z9.s\n"
- "ldr x25, [x14, #0x48]\n"
- "incw x9\n"
- "movprfx z29, z17\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x24, [x14, #0x50]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z17\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ldr x23, [x14, #0x58]\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z30.s, p3/M, z0.s, z16.s\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z29.s, p3/M, z3.s, z14.s\n"
- "ldr x25, [x14, #0x88]\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z31.s, p3/M, z4.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z15.s\n"
- "ld1w { z14.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z31.s, p3/M, z2.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "ldr x20, [x14, #0xb0]\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "ld1w { z15.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x22, x13, LSL #2]\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z17.s }, p3/Z, [x15]\n"
- "fmla z29.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z28.s, p3/M, z5.s, z14.s\n"
- "fmax z31.s, p3/M, z31.s, z19.s\n"
- "ld1w { z14.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "fmax z30.s, p3/M, z30.s, z19.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "incw x13\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "whilelt p2.s, x13, %x[n_channels]\n"
- "fmin z31.s, p3/M, z31.s, z18.s\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "fmin z30.s, p3/M, z30.s, z18.s\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "fmla z28.s, p3/M, z3.s, z16.s\n"
- "ld1w { z9.s }, p1/Z, [x26, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z8.s, z15.s\n"
- "ld1w { z10.s }, p1/Z, [x25, x12, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z14.s\n"
- "ld1w { z12.s }, p1/Z, [x23, x12, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x22, x12, LSL #2]\n"
- "fmax z29.s, p3/M, z29.s, z19.s\n"
- "ld1w { z14.s }, p1/Z, [x21, x12, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x27, [x15, #0x80]\n"
+ "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "fmla z30.s, p3/M, z3.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z15.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p1/Z, [x20, x12, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x19, x12, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z18.s\n"
- "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x24, x12, LSL #2]\n"
- "incw x12\n"
- "fmax z28.s, p3/M, z28.s, z19.s\n"
- "st1w { z30.s }, p0, [x10, x9, LSL #2]\n"
- "cmp x12, %x[n_channels]\n"
- "fmin z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "st1w { z28.s }, p0, [x27, x9, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "blt 1b\n"
- "2:" // Channel tail
- "movprfx z31, z17\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x26, [x14, #0x40]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x23, [x15, #0xa0]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "fmla z30.s, p3/M, z6.s, z15.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "ldr x27, [x15, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z15.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"incw x9\n"
- "movprfx z30, z17\n fmla z30.s, p3/M, z6.s, z9.s\n"
- "ldr x25, [x14, #0x48]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "incw x28\n"
+ "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
"mov p0.b, p2.b\n"
- "movprfx z29, z17\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x24, [x14, #0x50]\n"
- "movprfx z28, z17\n fmla z28.s, p3/M, z0.s, z9.s\n"
- "ldr x23, [x14, #0x58]\n"
- "ldr x22, [x14, #0x60]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z30.s, p3/M, z0.s, z16.s\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z29.s, p3/M, z3.s, z14.s\n"
- "ldr x25, [x14, #0x88]\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z31.s, p3/M, z4.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z0.s, z15.s\n"
- "ld1w { z14.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z31.s, p3/M, z2.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "ld1w { z19.s }, p3/Z, [x16]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z19\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x27, [x15, #0x40]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z31.s, p3/M, z5.s, z13.s\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "ldr x20, [x14, #0xb0]\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z31.s, p3/M, z6.s, z15.s\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "ld1w { z15.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x22, x13, LSL #2]\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z6.s, z15.s\n"
- "ld1w { z15.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z28.s, p3/M, z5.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmax z31.s, p3/M, z31.s, z19.s\n"
- "fmla z29.s, p3/M, z7.s, z13.s\n"
- "fmax z30.s, p3/M, z30.s, z19.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z18.s\n"
- "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "fmla z28.s, p3/M, z3.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z18.s\n"
- "st1w { z30.s }, p0, [x10, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z14.s\n"
- "fmla z29.s, p3/M, z8.s, z15.s\n"
+ "ld1w { z14.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x27, [x15, #0x80]\n"
+ "ld1w { z15.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "fmla z30.s, p3/M, z3.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z15.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x9, LSL #2]\n"
"fmla z28.s, p3/M, z6.s, z15.s\n"
- "fmax z29.s, p3/M, z29.s, z19.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmin z29.s, p3/M, z29.s, z18.s\n"
- "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "fmax z28.s, p3/M, z28.s, z19.s\n"
- "fmin z28.s, p3/M, z28.s, z18.s\n"
- "st1w { z28.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x23, [x15, #0xa0]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "fmla z30.s, p3/M, z6.s, z15.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z13.s\n"
+ "fmla z31.s, p3/M, z7.s, z14.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "ldr x27, [x15, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z15.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z28.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z29.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index ea8bbbd7e8..b4cf6c8582 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,432 +88,432 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x5, #0x0\n"
- "mov x6, #0x0\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
"1:" // Tile loop
- "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x20, #0x2\n"
- "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "mov x7, #0x2\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
- "mov x17, #0x0\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "cntw x16\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "sub x14, XZR, x16\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
"ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
- "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
- "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x13, x13, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cntw x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x11, x14, x23, LSL #2\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #2\n"
+ "add x28, x15, x17\n"
"ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "add x20, x13, x22, LSL #2\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x10, x20, x22, LSL #2\n"
- "ld1w { z16.s }, p3/Z, [x8]\n"
- "add x9, x10, x22, LSL #2\n"
- "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
- "add x28, x9, x22, LSL #2\n"
- "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
- "add x27, x28, x22, LSL #2\n"
- "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
- "add x26, x15, x15\n"
- "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
- "add x25, x26, x15\n"
- "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
- "add x24, x25, x15\n"
- "mul x19, x5, x21\n" // offset = tile_i * ld_output_row
- "add x23, x24, x15\n"
- "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col
- "mul x19, x19, x7\n" // offset *= output_tile_size
- "add x11, x11, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x22, x11, x21, LSL #2\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z5.s }, p2/Z, [x13]\n"
- "ld1w { z6.s }, p2/Z, [x13, x15, LSL #2]\n"
- "cmp x16, %x[n_channels]\n"
- "ld1w { z7.s }, p2/Z, [x20]\n"
- "addvl x8, x8, #6\n"
- "ld1w { z8.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ld1w { z9.s }, p2/Z, [x13, x26, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x20, x26, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x13, x25, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x13, x24, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x20, x23, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x10]\n"
+ "add x27, x9, x23, LSL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z16.s }, p3/Z, [x10]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #2\n"
+ "mov x21, #0x0\n"
+ "ld1w { z5.s }, p2/Z, [x14]\n"
+ "ld1w { z6.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "sub x20, XZR, x12\n"
+ "ld1w { z7.s }, p2/Z, [x11]\n"
+ "ld1w { z8.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "addvl x10, x10, #6\n"
+ "ld1w { z9.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x9]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n"
- "whilelt p1.s, x16, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z6.s\n"
- "incw x14\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z7.s\n"
+ "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z5.s\n"
+ "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z5.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z7.s\n"
+ "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z0.s }, p3/Z, [x10]\n"
+ "incw x21\n"
+ "fmla z28.s, p3/M, z1.s, z6.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "incw x12\n"
+ "fmla z30.s, p3/M, z1.s, z8.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z1.s }, p3/Z, [x10, #1, MUL VL]\n"
"mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z8.s\n"
- "ld1w { z0.s }, p3/Z, [x8]\n"
- "incw x17\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n"
- "addvl x20, x20, #1\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "incw x16\n"
- "fmla z29.s, p3/M, z1.s, z8.s\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n"
- "addvl x13, x13, #1\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z5.s\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z6.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z7.s\n"
- "ld1w { z7.s }, p1/Z, [x20]\n"
- "fmla z30.s, p3/M, z0.s, z8.s\n"
- "fmla z29.s, p3/M, z0.s, z14.s\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n"
- "addvl x10, x10, #1\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
"fmla z28.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
- "addvl x8, x8, #16\n"
- "fmla z31.s, p3/M, z3.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x9]\n"
- "ld1w { z16.s }, p3/Z, [x8, #4, MUL VL]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z5.s\n"
- "fmla z28.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z1.s, z6.s\n"
- "fmla z28.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n"
- "addvl x9, x9, #1\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x28]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z6.s\n"
- "fmla z29.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "incw x20\n"
+ "fmla z30.s, p3/M, z3.s, z5.s\n"
+ "fmla z31.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z6.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z7.s }, p1/Z, [x11]\n"
+ "fmla z30.s, p3/M, z0.s, z14.s\n"
+ "fmla z31.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z8.s\n"
"fmla z29.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x8]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n"
- "addvl x28, x28, #1\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
"fmla z29.s, p3/M, z2.s, z5.s\n"
- "fmla z28.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z28.s, p3/M, z3.s, z5.s\n"
"fmla z29.s, p3/M, z3.s, z6.s\n"
- "fmla z28.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z14.s\n"
- "ld1w { z14.s }, p1/Z, [x10]\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z5.s }, p2/Z, [x27]\n"
+ "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z6.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "ld1w { z13.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z14.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z5.s\n"
+ "fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z9.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "ld1w { z5.s }, p1/Z, [x13]\n"
- "fmla z30.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z1.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
"fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n"
- "whilelt p2.s, x17, %x[n_channels]\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
"addvl x27, x27, #1\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "ld1w { z6.s }, p1/Z, [x13, x15, LSL #2]\n"
- "addvl x8, x8, #16\n"
- "fmla z30.s, p3/M, z3.s, z8.s\n"
- "cmp x16, %x[n_channels]\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x13, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "ld1w { z8.s }, p1/Z, [x20, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z10.s }, p1/Z, [x20, x23, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x13, x24, LSL #2]\n"
- "fmla z28.s, p3/M, z4.s, z9.s\n"
- "ld1w { z9.s }, p1/Z, [x13, x26, LSL #2]\n"
- "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "addvl x8, x8, #-6\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmla z30.s, p3/M, z2.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x25]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z5.s\n"
+ "fmla z29.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z5.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z6.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z6.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z1.s }, p3/Z, [x10]\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z30.s, p3/M, z2.s, z5.s\n"
+ "fmla z31.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x23]\n"
+ "fmla z30.s, p3/M, z3.s, z6.s\n"
+ "fmla z31.s, p3/M, z3.s, z8.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x9]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z13.s\n"
+ "fmla z29.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z5.s\n"
+ "fmla z29.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z5.s }, p1/Z, [x14]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z6.s\n"
+ "fmla z29.s, p3/M, z3.s, z8.s\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z28.s, p3/M, z4.s, z8.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
"fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x11]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x11, x12, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
"fmin z28.s, p3/M, z28.s, z17.s\n"
- "addvl x11, x11, #1\n"
- "st1w { z29.s }, p0, [x22]\n"
- "st1w { z28.s }, p0, [x22, x12, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
+ "st1w { z28.s }, p0, [x13]\n"
+ "st1w { z29.s }, p0, [x13, x16, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1w { z31.s }, p0, [x22, x16, LSL #2]\n"
"addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n"
- "mov p0.b, p2.b\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z6.s\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x5, #0x1\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z7.s\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z8.s\n"
- "ld1w { z0.s }, p3/Z, [x8]\n"
- "add x6, x6, #0x1\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z5.s\n"
+ "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z5.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z7.s\n"
+ "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z0.s }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z28.s, p3/M, z1.s, z6.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z30.s, p3/M, z1.s, z8.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z1.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "cmp x8, x20\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x17, LSL #2]\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z29.s, p3/M, z1.s, z8.s\n"
- "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x6, x19\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z3.s, z5.s\n"
+ "fmla z31.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z6.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z30.s, p3/M, z0.s, z14.s\n"
+ "fmla z31.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z8.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n"
- "csel x6, x6, XZR, LT\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "csel x5, x5, x21, LT\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "cmp x5, x20\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z5.s\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z6.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z7.s\n"
- "fmla z30.s, p3/M, z0.s, z8.s\n"
- "fmla z29.s, p3/M, z0.s, z14.s\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z28.s, p3/M, z3.s, z5.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z5.s }, p2/Z, [x27]\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z6.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z5.s\n"
+ "fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z1.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x25]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z5.s\n"
+ "fmla z29.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z5.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z6.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z6.s }, p2/Z, [x25, x28, LSL #2]\n"
"fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z1.s }, p3/Z, [x10]\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x24, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z5.s\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
- "addvl x8, x8, #16\n"
- "fmla z31.s, p3/M, z3.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x9]\n"
+ "fmla z31.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z2.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x23]\n"
"fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z8.s\n"
+ "ld1w { z3.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z5.s\n"
- "fmla z28.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z13.s\n"
+ "fmla z29.s, p3/M, z1.s, z5.s\n"
"fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z1.s, z6.s\n"
- "fmla z28.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z5.s\n"
+ "fmla z29.s, p3/M, z2.s, z6.s\n"
"fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x28]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z6.s\n"
- "fmla z29.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x8]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "fmla z28.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "fmla z28.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z14.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z30.s, p3/M, z1.s, z5.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "fmla z30.s, p3/M, z2.s, z6.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "fmla z30.s, p3/M, z3.s, z8.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z9.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z6.s\n"
+ "fmla z29.s, p3/M, z3.s, z8.s\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z8.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
"fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x11]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x11, x12, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
"fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z29.s }, p0, [x22]\n"
- "st1w { z28.s }, p0, [x22, x12, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z28.s }, p0, [x13]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z29.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z30.s }, p0, [x22]\n"
+ "st1w { z31.s }, p0, [x22, x16, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 070270764c..cb70bd2b6f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,450 +98,450 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x15, x14, [x20, #0x0]\n"
"mov x13, #0x0\n"
- "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cntw x12\n"
- "ldp x11, x10, [x19, #0x0]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x19, #0x10]\n"
- "whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z16.s }, p3/Z, [x15]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
- "addvl x15, x15, #6\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "ld1w { z5.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ld1w { z6.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ld1w { z7.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ld1w { z8.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ld1w { z9.s }, p2/Z, [x22, x13, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x21, x13, LSL #2]\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "ldp x26, x25, [x14, #0x40]\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.s, XZR, %x[n_channels]\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "cntw x28\n"
+ "ptrue p2.b\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1w { z5.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "cmp x28, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ldp x26, x25, [x16, #0x10]\n"
+ "sub x24, XZR, x28\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ldp x10, x9, [x16, #0x40]\n"
+ "ld1rw { z18.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z17.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z16.s }, p2/Z, [x27]\n"
+ "ld1w { z0.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x27, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "addvl x27, x27, #6\n"
+ "ld1w { z8.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x9, x13, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z5.s\n"
- "ldr x24, [x14, #0x50]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z6.s\n"
- "ldr x23, [x14, #0x58]\n"
- "incw x9\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z7.s\n"
- "ldr x22, [x14, #0x60]\n"
- "mov p0.b, p2.b\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z8.s\n"
- "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x15]\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z29.s, p3/M, z1.s, z8.s\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ldr x25, [x14, #0x88]\n"
- "fmla z29.s, p3/M, z3.s, z5.s\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z6.s\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z7.s\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z30.s, p3/M, z0.s, z8.s\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z29.s, p3/M, z0.s, z14.s\n"
- "ldr x20, [x14, #0xb0]\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #5, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "ldr x25, [x14, #0xc8]\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmla z31.s, p3/M, z3.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ldr x24, [x14, #0xd0]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "ld1w { z16.s }, p3/Z, [x15, #4, MUL VL]\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x23, [x14, #0xd8]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "ldr x22, [x14, #0xe0]\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "ldr x19, [x14, #0xf8]\n"
- "fmla z29.s, p3/M, z0.s, z5.s\n"
- "fmla z28.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #-6, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x14, #0xe8]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z1.s, z6.s\n"
- "fmla z28.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #-5, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
- "ldr x20, [x14, #0xf0]\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #-4, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ldr x26, [x14, #0x100]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #-3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x14, #0x108]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #-2, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ldr x24, [x14, #0x110]\n"
- "fmla z30.s, p3/M, z0.s, z6.s\n"
- "fmla z29.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #-1, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x23, [x14, #0x118]\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x15]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "fmla z28.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "fmla z28.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z14.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldp x26, x25, [x14, #0x0]\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #5, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z30.s, p3/M, z1.s, z5.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z9.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #6, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "ld1w { z5.s }, p1/Z, [x26, x12, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z6.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "movprfx z28, z16\n fmla z28.s, p2/M, z0.s, z5.s\n"
+ "movprfx z29, z16\n fmla z29.s, p2/M, z0.s, z6.s\n"
+ "ldr x26, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "movprfx z30, z16\n fmla z30.s, p2/M, z0.s, z7.s\n"
+ "movprfx z31, z16\n fmla z31.s, p2/M, z0.s, z8.s\n"
+ "ldr x25, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "fmla z28.s, p2/M, z1.s, z6.s\n"
+ "fmla z29.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z30.s, p2/M, z1.s, z8.s\n"
+ "fmla z31.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.s, p2/M, z2.s, z9.s\n"
+ "fmla z29.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z2.s, z13.s\n"
+ "fmla z31.s, p2/M, z2.s, z5.s\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.s, p2/M, z3.s, z11.s\n"
+ "fmla z29.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla z30.s, p2/M, z3.s, z5.s\n"
+ "fmla z31.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla z28.s, p2/M, z4.s, z12.s\n"
+ "fmla z29.s, p2/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z6.s\n"
+ "fmla z31.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ldr x26, [x16, #0x90]\n"
+ "fmla z28.s, p2/M, z0.s, z7.s\n"
+ "fmla z29.s, p2/M, z0.s, z8.s\n"
+ "ldr x25, [x16, #0x98]\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.s, p2/M, z0.s, z14.s\n"
+ "fmla z31.s, p2/M, z0.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z8.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.s, p2/M, z1.s, z11.s\n"
+ "fmla z31.s, p2/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z28.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z5.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla z30.s, p2/M, z2.s, z12.s\n"
+ "fmla z31.s, p2/M, z2.s, z9.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "fmla z28.s, p2/M, z3.s, z5.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla z30.s, p2/M, z3.s, z9.s\n"
+ "fmla z31.s, p2/M, z3.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
+ "ldr x26, [x16, #0xd0]\n"
+ "fmla z28.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z13.s\n"
+ "fmla z31.s, p2/M, z4.s, z8.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
+ "ldr x25, [x16, #0xd8]\n"
+ "fmla z28.s, p2/M, z0.s, z14.s\n"
+ "fmla z29.s, p2/M, z0.s, z11.s\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.s, p2/M, z0.s, z5.s\n"
+ "fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #-6, MUL VL]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z28.s, p2/M, z1.s, z11.s\n"
+ "fmla z29.s, p2/M, z1.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z31.s, p2/M, z1.s, z10.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #-5, MUL VL]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "fmla z28.s, p2/M, z2.s, z12.s\n"
+ "fmla z29.s, p2/M, z2.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z30.s, p2/M, z2.s, z10.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #-4, MUL VL]\n"
+ "incw x24\n"
+ "fmla z28.s, p2/M, z3.s, z9.s\n"
+ "fmla z29.s, p2/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z28.s, p2/M, z4.s, z13.s\n"
+ "fmla z29.s, p2/M, z4.s, z8.s\n"
+ "ld1w { z13.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z14.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #-2, MUL VL]\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla z28.s, p2/M, z0.s, z5.s\n"
+ "fmla z29.s, p2/M, z0.s, z6.s\n"
+ "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ldr x26, [x16, #0x110]\n"
+ "fmla z30.s, p2/M, z0.s, z9.s\n"
+ "fmla z31.s, p2/M, z0.s, z13.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #-1, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "fmla z28.s, p2/M, z1.s, z6.s\n"
+ "fmla z29.s, p2/M, z1.s, z10.s\n"
+ "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0x118]\n"
+ "fmla z30.s, p2/M, z1.s, z13.s\n"
+ "fmla z31.s, p2/M, z1.s, z5.s\n"
+ "ld1w { z1.s }, p2/Z, [x27]\n"
+ "fmla z28.s, p2/M, z2.s, z10.s\n"
+ "fmla z29.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z2.s, z5.s\n"
+ "fmla z31.s, p2/M, z2.s, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z28.s, p2/M, z3.s, z11.s\n"
+ "fmla z29.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z3.s, z6.s\n"
+ "fmla z31.s, p2/M, z3.s, z8.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.s, p2/M, z4.s, z12.s\n"
+ "fmla z29.s, p2/M, z4.s, z14.s\n"
+ "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z8.s\n"
+ "fmla z31.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "fmla z28.s, p2/M, z0.s, z9.s\n"
+ "fmla z29.s, p2/M, z0.s, z13.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z0.s, z11.s\n"
+ "fmla z31.s, p2/M, z0.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla z28.s, p2/M, z1.s, z13.s\n"
+ "fmla z29.s, p2/M, z1.s, z5.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
+ "fmla z30.s, p2/M, z1.s, z12.s\n"
+ "fmla z31.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
+ "fmla z28.s, p2/M, z2.s, z5.s\n"
+ "fmla z29.s, p2/M, z2.s, z6.s\n"
+ "ld1w { z5.s }, p1/Z, [x10, x28, LSL #2]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldp x26, x25, [x16, #0x10]\n"
+ "fmla z28.s, p2/M, z3.s, z6.s\n"
+ "fmla z29.s, p2/M, z3.s, z8.s\n"
+ "ld1w { z6.s }, p1/Z, [x9, x28, LSL #2]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ldp x10, x9, [x16, #0x40]\n"
+ "fmla z28.s, p2/M, z4.s, z8.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
"incw x13\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ldp x24, x23, [x14, #0x10]\n"
- "whilelt p2.s, x13, %x[n_channels]\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "ld1w { z6.s }, p1/Z, [x25, x12, LSL #2]\n"
- "ldp x22, x21, [x14, #0x20]\n"
- "fmla z30.s, p3/M, z3.s, z8.s\n"
- "ldp x20, x19, [x14, #0x30]\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "ld1w { z7.s }, p1/Z, [x24, x12, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z13.s }, p1/Z, [x21, x12, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "ld1w { z8.s }, p1/Z, [x23, x12, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z11.s }, p1/Z, [x20, x12, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p1/Z, [x19, x12, LSL #2]\n"
- "fmla z28.s, p3/M, z4.s, z9.s\n"
- "ld1w { z9.s }, p1/Z, [x22, x12, LSL #2]\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "ldp x26, x25, [x14, #0x40]\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #7, MUL VL]\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "addvl x15, x15, #16\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
- "ld1w { z14.s }, p1/Z, [x25, x12, LSL #2]\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "incw x12\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "addvl x15, x15, #-6\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x10, x9, LSL #2]\n"
- "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x27, x9, LSL #2]\n"
+ "ld1w { z7.s }, p1/Z, [x26, x28, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z9.s\n"
+ "ld1w { z8.s }, p1/Z, [x25, x28, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x21, x28, LSL #2]\n"
+ "fmax z28.s, p2/M, z28.s, z18.s\n"
+ "fmax z29.s, p2/M, z29.s, z18.s\n"
+ "ld1w { z12.s }, p1/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x10, x28, LSL #2]\n"
+ "fmax z30.s, p2/M, z30.s, z18.s\n"
+ "fmax z31.s, p2/M, z31.s, z18.s\n"
+ "ld1w { z14.s }, p1/Z, [x9, x28, LSL #2]\n"
+ "incw x28\n"
+ "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "whilelt p3.s, x13, %x[n_channels]\n"
+ "cmp x28, %x[n_channels]\n"
+ "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
+ "fmin z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z17.s\n"
+ "st1w { z28.s }, p0, [x15, x24, LSL #2]\n"
+ "fmin z30.s, p2/M, z30.s, z17.s\n"
+ "fmin z31.s, p2/M, z31.s, z17.s\n"
+ "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x12, x24, LSL #2]\n"
+ "addvl x27, x27, #-6\n"
+ "st1w { z31.s }, p0, [x11, x24, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z31, z16\n fmla z31.s, p3/M, z0.s, z5.s\n"
- "ldr x24, [x14, #0x50]\n"
- "incw x9\n"
- "movprfx z30, z16\n fmla z30.s, p3/M, z0.s, z6.s\n"
- "ldr x23, [x14, #0x58]\n"
- "mov p0.b, p2.b\n"
- "movprfx z29, z16\n fmla z29.s, p3/M, z0.s, z7.s\n"
- "ldr x22, [x14, #0x60]\n"
- "movprfx z28, z16\n fmla z28.s, p3/M, z0.s, z8.s\n"
- "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x15]\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z9.s\n"
- "ldr x21, [x14, #0x68]\n"
- "fmla z29.s, p3/M, z1.s, z8.s\n"
- "fmla z28.s, p3/M, z1.s, z13.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #1, MUL VL]\n"
- "ldr x20, [x14, #0x70]\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "ldr x19, [x14, #0x78]\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "fmla z28.s, p3/M, z2.s, z5.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #2, MUL VL]\n"
- "ldr x26, [x14, #0x80]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "ldr x25, [x14, #0x88]\n"
- "fmla z29.s, p3/M, z3.s, z5.s\n"
- "fmla z28.s, p3/M, z3.s, z6.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #3, MUL VL]\n"
- "ldr x24, [x14, #0x90]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z6.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #4, MUL VL]\n"
- "ldr x23, [x14, #0x98]\n"
- "fmla z31.s, p3/M, z0.s, z7.s\n"
- "ldr x22, [x14, #0xa0]\n"
- "fmla z30.s, p3/M, z0.s, z8.s\n"
- "ldr x21, [x14, #0xa8]\n"
- "fmla z29.s, p3/M, z0.s, z14.s\n"
- "fmla z28.s, p3/M, z0.s, z11.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #5, MUL VL]\n"
- "ldr x20, [x14, #0xb0]\n"
- "fmla z31.s, p3/M, z1.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "ldr x19, [x14, #0xb8]\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #6, MUL VL]\n"
- "ldr x25, [x14, #0xc8]\n"
- "fmla z31.s, p3/M, z2.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z5.s\n"
- "ldr x26, [x14, #0xc0]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z9.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
- "fmla z31.s, p3/M, z3.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ldr x24, [x14, #0xd0]\n"
- "fmla z30.s, p3/M, z3.s, z6.s\n"
- "fmla z29.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z3.s, z13.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #-8, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x23, [x14, #0xd8]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "fmla z28.s, p3/M, z4.s, z8.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #-7, MUL VL]\n"
- "ldr x22, [x14, #0xe0]\n"
- "fmla z31.s, p3/M, z0.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "ldr x19, [x14, #0xf8]\n"
- "fmla z29.s, p3/M, z0.s, z5.s\n"
- "fmla z28.s, p3/M, z0.s, z6.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #-6, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x14, #0xe8]\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z29.s, p3/M, z1.s, z6.s\n"
- "fmla z28.s, p3/M, z1.s, z10.s\n"
- "ld1w { z1.s }, p3/Z, [x15, #-5, MUL VL]\n"
- "fmla z31.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
- "ldr x20, [x14, #0xf0]\n"
- "fmla z30.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z2.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #-4, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "ldr x26, [x14, #0x100]\n"
- "fmla z30.s, p3/M, z3.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #-3, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z13.s\n"
- "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x14, #0x108]\n"
- "fmla z30.s, p3/M, z4.s, z8.s\n"
- "ld1w { z8.s }, p2/Z, [x22, x13, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z14.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #-2, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z5.s\n"
- "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
- "ldr x24, [x14, #0x110]\n"
- "fmla z30.s, p3/M, z0.s, z6.s\n"
- "fmla z29.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z0.s }, p3/Z, [x15, #-1, MUL VL]\n"
- "fmla z31.s, p3/M, z1.s, z6.s\n"
- "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
- "ldr x23, [x14, #0x118]\n"
- "fmla z30.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z13.s\n"
- "fmla z28.s, p3/M, z1.s, z5.s\n"
- "ld1w { z1.s }, p3/Z, [x15]\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z5.s\n"
- "fmla z28.s, p3/M, z2.s, z6.s\n"
- "ld1w { z2.s }, p3/Z, [x15, #1, MUL VL]\n"
- "fmla z31.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z6.s\n"
- "fmla z28.s, p3/M, z3.s, z8.s\n"
- "ld1w { z3.s }, p3/Z, [x15, #2, MUL VL]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z14.s\n"
- "fmla z29.s, p3/M, z4.s, z8.s\n"
- "fmla z28.s, p3/M, z4.s, z10.s\n"
- "ld1w { z4.s }, p3/Z, [x15, #3, MUL VL]\n"
- "fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z30.s, p3/M, z1.s, z5.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z5.s\n"
- "fmla z30.s, p3/M, z2.s, z6.s\n"
- "fmla z29.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "fmla z31.s, p3/M, z3.s, z6.s\n"
- "fmla z30.s, p3/M, z3.s, z8.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z8.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z9.s\n"
- "fmax z31.s, p3/M, z31.s, z18.s\n"
- "fmax z30.s, p3/M, z30.s, z18.s\n"
- "fmax z29.s, p3/M, z29.s, z18.s\n"
- "fmax z28.s, p3/M, z28.s, z18.s\n"
- "fmin z31.s, p3/M, z31.s, z17.s\n"
- "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
- "fmin z30.s, p3/M, z30.s, z17.s\n"
- "fmin z29.s, p3/M, z29.s, z17.s\n"
- "st1w { z30.s }, p0, [x10, x9, LSL #2]\n"
- "fmin z28.s, p3/M, z28.s, z17.s\n"
- "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z28.s }, p0, [x27, x9, LSL #2]\n"
+ "movprfx z28, z16\n fmla z28.s, p2/M, z0.s, z5.s\n"
+ "movprfx z29, z16\n fmla z29.s, p2/M, z0.s, z6.s\n"
+ "ldr x26, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "movprfx z30, z16\n fmla z30.s, p2/M, z0.s, z7.s\n"
+ "movprfx z31, z16\n fmla z31.s, p2/M, z0.s, z8.s\n"
+ "ldr x25, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "fmla z28.s, p2/M, z1.s, z6.s\n"
+ "fmla z29.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z30.s, p2/M, z1.s, z8.s\n"
+ "fmla z31.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.s, p2/M, z2.s, z9.s\n"
+ "fmla z29.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z2.s, z13.s\n"
+ "fmla z31.s, p2/M, z2.s, z5.s\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.s, p2/M, z3.s, z11.s\n"
+ "fmla z29.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla z30.s, p2/M, z3.s, z5.s\n"
+ "fmla z31.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla z28.s, p2/M, z4.s, z12.s\n"
+ "fmla z29.s, p2/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z6.s\n"
+ "fmla z31.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ldr x26, [x16, #0x90]\n"
+ "fmla z28.s, p2/M, z0.s, z7.s\n"
+ "fmla z29.s, p2/M, z0.s, z8.s\n"
+ "ldr x25, [x16, #0x98]\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.s, p2/M, z0.s, z14.s\n"
+ "fmla z31.s, p2/M, z0.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #5, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z8.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.s, p2/M, z1.s, z11.s\n"
+ "fmla z31.s, p2/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z28.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z5.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla z30.s, p2/M, z2.s, z12.s\n"
+ "fmla z31.s, p2/M, z2.s, z9.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ "fmla z28.s, p2/M, z3.s, z5.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla z30.s, p2/M, z3.s, z9.s\n"
+ "fmla z31.s, p2/M, z3.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #-8, MUL VL]\n"
+ "ldr x26, [x16, #0xd0]\n"
+ "fmla z28.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z13.s\n"
+ "fmla z31.s, p2/M, z4.s, z8.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #-7, MUL VL]\n"
+ "ldr x25, [x16, #0xd8]\n"
+ "fmla z28.s, p2/M, z0.s, z14.s\n"
+ "fmla z29.s, p2/M, z0.s, z11.s\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.s, p2/M, z0.s, z5.s\n"
+ "fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #-6, MUL VL]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z28.s, p2/M, z1.s, z11.s\n"
+ "fmla z29.s, p2/M, z1.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z31.s, p2/M, z1.s, z10.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #-5, MUL VL]\n"
+ "incw x24\n"
+ "fmla z28.s, p2/M, z2.s, z12.s\n"
+ "fmla z29.s, p2/M, z2.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z30.s, p2/M, z2.s, z10.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z28.s, p2/M, z3.s, z9.s\n"
+ "fmla z29.s, p2/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #-3, MUL VL]\n"
+ "fmla z28.s, p2/M, z4.s, z13.s\n"
+ "fmla z29.s, p2/M, z4.s, z8.s\n"
+ "ld1w { z13.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z14.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #-2, MUL VL]\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla z28.s, p2/M, z0.s, z5.s\n"
+ "fmla z29.s, p2/M, z0.s, z6.s\n"
+ "ld1w { z5.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ldr x26, [x16, #0x110]\n"
+ "fmla z30.s, p2/M, z0.s, z9.s\n"
+ "fmla z31.s, p2/M, z0.s, z13.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, #-1, MUL VL]\n"
+ "fmla z28.s, p2/M, z1.s, z6.s\n"
+ "fmla z29.s, p2/M, z1.s, z10.s\n"
+ "ld1w { z6.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0x118]\n"
+ "fmla z30.s, p2/M, z1.s, z13.s\n"
+ "fmla z31.s, p2/M, z1.s, z5.s\n"
+ "ld1w { z1.s }, p2/Z, [x27]\n"
+ "fmla z28.s, p2/M, z2.s, z10.s\n"
+ "fmla z29.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z2.s, z5.s\n"
+ "fmla z31.s, p2/M, z2.s, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "fmla z28.s, p2/M, z3.s, z11.s\n"
+ "fmla z29.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z3.s, z6.s\n"
+ "fmla z31.s, p2/M, z3.s, z8.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "fmla z28.s, p2/M, z4.s, z12.s\n"
+ "fmla z29.s, p2/M, z4.s, z14.s\n"
+ "ld1w { z12.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z4.s, z8.s\n"
+ "fmla z31.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "fmla z28.s, p2/M, z0.s, z9.s\n"
+ "fmla z29.s, p2/M, z0.s, z13.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z0.s, z11.s\n"
+ "fmla z31.s, p2/M, z0.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x9, x13, LSL #2]\n"
+ "fmla z28.s, p2/M, z1.s, z13.s\n"
+ "fmla z29.s, p2/M, z1.s, z5.s\n"
+ "fmla z30.s, p2/M, z1.s, z12.s\n"
+ "fmla z31.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z28.s, p2/M, z2.s, z5.s\n"
+ "fmla z29.s, p2/M, z2.s, z6.s\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z28.s, p2/M, z3.s, z6.s\n"
+ "fmla z29.s, p2/M, z3.s, z8.s\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "fmla z28.s, p2/M, z4.s, z8.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "fmax z28.s, p2/M, z28.s, z18.s\n"
+ "fmax z29.s, p2/M, z29.s, z18.s\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z9.s\n"
+ "fmax z30.s, p2/M, z30.s, z18.s\n"
+ "fmax z31.s, p2/M, z31.s, z18.s\n"
+ "fmin z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z17.s\n"
+ "st1w { z28.s }, p0, [x15, x24, LSL #2]\n"
+ "fmin z30.s, p2/M, z30.s, z17.s\n"
+ "fmin z31.s, p2/M, z31.s, z17.s\n"
+ "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
+ "st1w { z30.s }, p0, [x12, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x11, x24, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index eac77516c2..204f36edca 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,118 +45,118 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
__asm__ __volatile__(
"ptrue p1.b\n"
- "ld1rw { z4.s }, p1/Z, [%x[minmax_vals]]\n"
- "mov x28, #0x0\n"
- "ld1rw { z3.s }, p1/Z, [%x[minmax_vals], #4]\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "mov x11, #0x0\n"
+ "ld1rw { z2.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
"1:" // Channel loop
- "mov z2.b, #0x0\n"
+ "mov z23.b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ld1w { z2.s }, p0/Z, [%x[bias], x28, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
"2:" // Channel loop: Load bias: Done
- "mov z1.d, z2.d\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "mov z24.d, z23.d\n"
+ "mov z25.d, z23.d\n"
+ "ldr x20, [x10], #0x8\n"
+ "mov z26.d, z23.d\n"
+ "mov z27.d, z23.d\n"
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
- "mov x22, %x[inptrs]\n"
- "mov z31.d, z2.d\n"
- "ldp x20, x19, [x22], #0x10\n"
- "subs x21, %x[n_points], #0x1\n"
- "mov z30.d, z2.d\n"
- "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n"
- "mov z28.d, z2.d\n"
+ "mov z28.d, z23.d\n"
+ "mov z29.d, z23.d\n"
+ "ld1w { z14.s }, p0/Z, [x9, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "mov z30.d, z23.d\n"
+ "mov z31.d, z23.d\n"
+ "ld1w { z16.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
- "mov z27.d, z2.d\n"
- "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n"
- "mov z25.d, z2.d\n"
- "ldp x20, x19, [x22], #0x10\n"
- "mov z24.d, z2.d\n"
- "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n"
- "mov z22.d, z2.d\n"
- "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n"
- "ldp x20, x19, [x22], #0x10\n"
- "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n"
- "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n"
- "ldp x20, x19, [x22], #0x10\n"
- "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n"
- "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n"
- "ldr x19, [x22], #0x8\n"
- "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "fmla z2.s, p1/M, z29.s, z0.s\n"
- "ldp x20, x19, [x22], #0x10\n"
- "subs x21, x21, #0x1\n"
- "fmla z1.s, p1/M, z26.s, z0.s\n"
- "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n"
- "fmla z31.s, p1/M, z23.s, z0.s\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "subs x25, x25, #0x1\n"
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "ldr x20, [x10], #0x8\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "ld1w { z14.s }, p0/Z, [x9, x11, LSL #2]\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "ld1w { z15.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x27, x11, LSL #2]\n"
"fmla z30.s, p1/M, z21.s, z0.s\n"
- "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n"
- "fmla z28.s, p1/M, z20.s, z0.s\n"
- "ldp x20, x19, [x22], #0x10\n"
- "fmla z27.s, p1/M, z19.s, z0.s\n"
- "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n"
- "fmla z25.s, p1/M, z18.s, z0.s\n"
- "fmla z24.s, p1/M, z17.s, z0.s\n"
- "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n"
- "fmla z22.s, p1/M, z16.s, z0.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "ld1w { z17.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
- "ldp x20, x19, [x22], #0x10\n"
- "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n"
- "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n"
- "ldp x20, x19, [x22], #0x10\n"
- "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n"
- "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n"
- "ldr x19, [x22], #0x8\n"
- "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
- "fmla z2.s, p1/M, z29.s, z0.s\n"
- "ldp x27, x26, [%x[outptrs], #0x0]\n"
- "fmla z1.s, p1/M, z26.s, z0.s\n"
- "ldp x25, x24, [%x[outptrs], #0x10]\n"
- "fmla z31.s, p1/M, z23.s, z0.s\n"
- "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmax z23.s, p1/M, z23.s, z2.s\n"
+ "fmax z24.s, p1/M, z24.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmax z25.s, p1/M, z25.s, z2.s\n"
+ "fmax z26.s, p1/M, z26.s, z2.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmax z27.s, p1/M, z27.s, z2.s\n"
+ "fmax z28.s, p1/M, z28.s, z2.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
"fmla z30.s, p1/M, z21.s, z0.s\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "fmla z28.s, p1/M, z20.s, z0.s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmla z27.s, p1/M, z19.s, z0.s\n"
- "fmla z25.s, p1/M, z18.s, z0.s\n"
- "fmla z24.s, p1/M, z17.s, z0.s\n"
- "fmla z22.s, p1/M, z16.s, z0.s\n"
- "fmax z2.s, p1/M, z2.s, z4.s\n"
- "fmax z1.s, p1/M, z1.s, z4.s\n"
- "fmax z31.s, p1/M, z31.s, z4.s\n"
- "fmax z30.s, p1/M, z30.s, z4.s\n"
- "fmin z2.s, p1/M, z2.s, z3.s\n"
- "st1w { z2.s }, p0, [x27, x28, LSL #2]\n"
- "fmin z1.s, p1/M, z1.s, z3.s\n"
- "fmin z31.s, p1/M, z31.s, z3.s\n"
- "st1w { z1.s }, p0, [x26, x28, LSL #2]\n"
- "fmin z30.s, p1/M, z30.s, z3.s\n"
- "fmax z28.s, p1/M, z28.s, z4.s\n"
- "st1w { z31.s }, p0, [x25, x28, LSL #2]\n"
- "fmax z27.s, p1/M, z27.s, z4.s\n"
- "st1w { z30.s }, p0, [x24, x28, LSL #2]\n"
- "fmin z28.s, p1/M, z28.s, z3.s\n"
- "fmax z25.s, p1/M, z25.s, z4.s\n"
- "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
- "fmin z27.s, p1/M, z27.s, z3.s\n"
- "fmin z25.s, p1/M, z25.s, z3.s\n"
- "st1w { z27.s }, p0, [x22, x28, LSL #2]\n"
- "fmax z24.s, p1/M, z24.s, z4.s\n"
- "fmax z22.s, p1/M, z22.s, z4.s\n"
- "st1w { z25.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z24.s, p1/M, z24.s, z3.s\n"
- "st1w { z24.s }, p0, [x20, x28, LSL #2]\n"
- "fmin z22.s, p1/M, z22.s, z3.s\n"
- "st1w { z22.s }, p0, [x19, x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
+ "fmax z29.s, p1/M, z29.s, z2.s\n"
+ "fmax z30.s, p1/M, z30.s, z2.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "fmax z31.s, p1/M, z31.s, z2.s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmin z23.s, p1/M, z23.s, z1.s\n"
+ "fmin z24.s, p1/M, z24.s, z1.s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmin z25.s, p1/M, z25.s, z1.s\n"
+ "fmin z26.s, p1/M, z26.s, z1.s\n"
+ "st1w { z23.s }, p0, [x28, x11, LSL #2]\n"
+ "fmin z27.s, p1/M, z27.s, z1.s\n"
+ "fmin z28.s, p1/M, z28.s, z1.s\n"
+ "st1w { z24.s }, p0, [x27, x11, LSL #2]\n"
+ "fmin z29.s, p1/M, z29.s, z1.s\n"
+ "fmin z30.s, p1/M, z30.s, z1.s\n"
+ "st1w { z25.s }, p0, [x26, x11, LSL #2]\n"
+ "fmin z31.s, p1/M, z31.s, z1.s\n"
+ "st1w { z26.s }, p0, [x25, x11, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x11, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x11, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x11, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x11, LSL #2]\n"
+ "incw x11\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 395b112460..7ba0edd991 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,214 +42,214 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ldp x12, x11, [%x[outptrs], #0x0]\n"
- "ptrue p2.b\n"
- "ldp x10, x9, [%x[outptrs], #0x10]\n"
- "mov x28, #0x0\n"
- "ldp x27, x26, [%x[outptrs], #0x20]\n"
- "mov x25, #0x0\n"
- "ldp x24, x23, [%x[outptrs], #0x30]\n"
- "whilelt p1.s, x28, %x[channel_multiplier]\n"
- "ldr x22, [%x[outptrs], #0x40]\n"
- "ldr x21, [%x[inptrs], #0x0]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
- "ld1rqw { z2.s }, p2/Z, [x21]\n"
- "ld1rqw { z3.s }, p2/Z, [x21, #16]\n"
- "ld1rqw { z4.s }, p2/Z, [x20]\n"
- "ld1rqw { z5.s }, p2/Z, [x20, #16]\n"
- "ld1rqw { z6.s }, p2/Z, [x19]\n"
- "ld1rqw { z7.s }, p2/Z, [x19, #16]\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "ldr x19, [%x[inptrs], #0x28]\n"
- "ld1rqw { z8.s }, p2/Z, [x21]\n"
- "ld1rqw { z9.s }, p2/Z, [x21, #16]\n"
- "ld1rqw { z10.s }, p2/Z, [x20]\n"
- "ld1rqw { z11.s }, p2/Z, [x20, #16]\n"
- "ld1rqw { z12.s }, p2/Z, [x19]\n"
- "ld1rqw { z13.s }, p2/Z, [x19, #16]\n"
- "ldr x19, [%x[inptrs], #0x30]\n"
- "ld1rw { z26.s }, p2/Z, [%x[clamps]]\n"
- "ld1rw { z25.s }, p2/Z, [%x[clamps], #4]\n"
- "ld1rqw { z14.s }, p2/Z, [x19]\n"
- "ld1rqw { z15.s }, p2/Z, [x19, #16]\n"
- "ld1w { z24.s }, p1/Z, [%x[params]]\n"
- "mov z23.d, z24.d\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mov z22.d, z24.d\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "mov z21.d, z24.d\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov x17, #0x0\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "ldr x16, [%x[inptrs], #0x0]\n"
+ "ldr x15, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x14, [%x[inptrs], #0x10]\n"
+ "ldr x13, [%x[inptrs], #0x18]\n"
+ "mov x12, #0x0\n"
+ "ldr x11, [%x[inptrs], #0x20]\n"
+ "ldr x10, [%x[inptrs], #0x28]\n"
+ "ldr x9, [%x[inptrs], #0x30]\n"
+ "ld1w { z26.s }, p2/Z, [%x[params]]\n"
+ "mov z25.d, z26.d\n"
+ "mov z24.d, z26.d\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "mov z23.d, z26.d\n"
+ "mov z22.d, z26.d\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z26.d\n"
+ "mov z20.d, z26.d\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "ld1rqw { z2.s }, p1/Z, [x16]\n"
+ "mov z19.d, z26.d\n"
+ "mov z18.d, z26.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x15]\n"
+ "ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x14]\n"
+ "ld1rqw { z7.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x13]\n"
+ "ld1rqw { z9.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x11]\n"
+ "ld1rqw { z11.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x10]\n"
+ "ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
+ "ld1rqw { z14.s }, p1/Z, [x9]\n"
+ "ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
+ "ld1rw { z17.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
- "mov z20.d, z24.d\n"
- "mov z19.d, z24.d\n"
- "mov z18.d, z24.d\n"
- "mov z17.d, z24.d\n"
- "mov z16.d, z24.d\n"
"1:" // Output channel complete vector loop
+ "fmla z26.s, z31.s, z2.s[0]\n"
+ "fmla z23.s, z31.s, z6.s[0]\n"
"mov z0.d, z10.d\n"
- "mov p0.b, p1.b\n"
+ "incw x17\n"
+ "fmla z22.s, z31.s, z6.s[2]\n"
+ "fmla z21.s, z31.s, z7.s[0]\n"
"mov z1.d, z11.d\n"
- "incw x28\n"
- "fmla z24.s, z31.s, z2.s[0]\n"
- "whilelt p1.s, x28, %x[channel_multiplier]\n"
- "fmla z23.s, z31.s, z2.s[2]\n"
- "fmla z22.s, z31.s, z3.s[0]\n"
- "fmla z21.s, z31.s, z6.s[0]\n"
- "fmla z20.s, z31.s, z6.s[2]\n"
- "fmla z19.s, z31.s, z7.s[0]\n"
- "fmla z18.s, z31.s, z0.s[0]\n"
- "fmla z17.s, z31.s, z0.s[2]\n"
- "fmla z16.s, z31.s, z1.s[0]\n"
- "ld1w { z31.s }, p2/Z, [%x[params]]\n"
- "fmla z24.s, z30.s, z2.s[1]\n"
- "fmla z23.s, z30.s, z2.s[3]\n"
- "fmla z22.s, z30.s, z3.s[1]\n"
- "fmla z21.s, z30.s, z6.s[1]\n"
- "fmla z20.s, z30.s, z6.s[3]\n"
- "fmla z19.s, z30.s, z7.s[1]\n"
- "fmla z18.s, z30.s, z0.s[1]\n"
- "fmla z17.s, z30.s, z0.s[3]\n"
- "fmla z16.s, z30.s, z1.s[1]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "fmla z24.s, z29.s, z2.s[2]\n"
- "fmla z23.s, z29.s, z3.s[0]\n"
- "fmla z22.s, z29.s, z3.s[2]\n"
- "fmla z21.s, z29.s, z6.s[2]\n"
- "fmla z20.s, z29.s, z7.s[0]\n"
- "fmla z19.s, z29.s, z7.s[2]\n"
- "fmla z18.s, z29.s, z0.s[2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z25.s, z31.s, z2.s[2]\n"
+ "fmla z24.s, z31.s, z3.s[0]\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "fmla z20.s, z31.s, z0.s[0]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "fmla z18.s, z31.s, z1.s[0]\n"
+ "fmla z26.s, z30.s, z2.s[1]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params]]\n"
+ "fmla z23.s, z30.s, z6.s[1]\n"
+ "fmla z22.s, z30.s, z6.s[3]\n"
+ "fmla z21.s, z30.s, z7.s[1]\n"
+ "fmla z25.s, z30.s, z2.s[3]\n"
+ "fmla z24.s, z30.s, z3.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[1]\n"
+ "fmla z19.s, z30.s, z0.s[3]\n"
+ "fmla z18.s, z30.s, z1.s[1]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z26.s, z29.s, z2.s[2]\n"
+ "fmla z23.s, z29.s, z6.s[2]\n"
+ "fmla z22.s, z29.s, z7.s[0]\n"
+ "fmla z21.s, z29.s, z7.s[2]\n"
+ "fmla z25.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z3.s[2]\n"
+ "fmla z20.s, z29.s, z0.s[2]\n"
+ "fmla z19.s, z29.s, z1.s[0]\n"
"mov z0.d, z8.d\n"
- "fmla z17.s, z29.s, z1.s[0]\n"
- "fmla z16.s, z29.s, z1.s[2]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z18.s, z29.s, z1.s[2]\n"
"mov z1.d, z9.d\n"
- "fmla z24.s, z31.s, z4.s[0]\n"
- "fmla z23.s, z31.s, z4.s[2]\n"
- "fmla z22.s, z31.s, z5.s[0]\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[2]\n"
+ "fmla z26.s, z31.s, z4.s[0]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z22.s, z31.s, z0.s[2]\n"
"mov z0.d, z12.d\n"
- "fmla z19.s, z31.s, z1.s[0]\n"
+ "fmla z21.s, z31.s, z1.s[0]\n"
"mov z1.d, z13.d\n"
- "fmla z18.s, z31.s, z0.s[0]\n"
- "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z25.s, z31.s, z4.s[2]\n"
+ "fmla z24.s, z31.s, z5.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[0]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "fmla z18.s, z31.s, z1.s[0]\n"
"mov z0.d, z8.d\n"
- "fmla z16.s, z31.s, z1.s[0]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #3, MUL VL]\n"
"mov z1.d, z9.d\n"
- "fmla z24.s, z30.s, z4.s[1]\n"
- "fmla z23.s, z30.s, z4.s[3]\n"
- "fmla z22.s, z30.s, z5.s[1]\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z4.s[1]\n"
+ "fmla z23.s, z30.s, z0.s[1]\n"
+ "fmla z22.s, z30.s, z0.s[3]\n"
+ "fmla z21.s, z30.s, z1.s[1]\n"
"mov z0.d, z12.d\n"
- "fmla z19.s, z30.s, z1.s[1]\n"
"mov z1.d, z13.d\n"
- "fmla z18.s, z30.s, z0.s[1]\n"
- "fmla z17.s, z30.s, z0.s[3]\n"
+ "fmla z25.s, z30.s, z4.s[3]\n"
+ "fmla z24.s, z30.s, z5.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[1]\n"
+ "fmla z19.s, z30.s, z0.s[3]\n"
"mov z0.d, z8.d\n"
- "fmla z16.s, z30.s, z1.s[1]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z18.s, z30.s, z1.s[1]\n"
"mov z1.d, z9.d\n"
- "fmla z24.s, z29.s, z4.s[2]\n"
- "fmla z23.s, z29.s, z5.s[0]\n"
- "fmla z22.s, z29.s, z5.s[2]\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[2]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z23.s, z29.s, z0.s[2]\n"
+ "fmla z22.s, z29.s, z1.s[0]\n"
"mov z0.d, z12.d\n"
- "fmla z20.s, z29.s, z1.s[0]\n"
- "fmla z19.s, z29.s, z1.s[2]\n"
+ "fmla z21.s, z29.s, z1.s[2]\n"
"mov z1.d, z13.d\n"
- "fmla z18.s, z29.s, z0.s[2]\n"
+ "fmla z25.s, z29.s, z5.s[0]\n"
+ "fmla z24.s, z29.s, z5.s[2]\n"
+ "fmla z20.s, z29.s, z0.s[2]\n"
"mov z0.d, z10.d\n"
- "fmla z17.s, z29.s, z1.s[0]\n"
- "fmla z16.s, z29.s, z1.s[2]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z1.s[2]\n"
"mov z1.d, z11.d\n"
- "fmla z24.s, z31.s, z6.s[0]\n"
- "fmla z23.s, z31.s, z6.s[2]\n"
- "fmla z22.s, z31.s, z7.s[0]\n"
- "fmla z21.s, z31.s, z0.s[0]\n"
- "fmla z20.s, z31.s, z0.s[2]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z26.s, z31.s, z6.s[0]\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z22.s, z31.s, z0.s[2]\n"
+ "fmla z21.s, z31.s, z1.s[0]\n"
"mov z0.d, z14.d\n"
- "fmla z19.s, z31.s, z1.s[0]\n"
"mov z1.d, z15.d\n"
- "fmla z18.s, z31.s, z0.s[0]\n"
- "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z25.s, z31.s, z6.s[2]\n"
+ "fmla z24.s, z31.s, z7.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[0]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
"mov z0.d, z10.d\n"
- "fmla z16.s, z31.s, z1.s[0]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z18.s, z31.s, z1.s[0]\n"
"mov z1.d, z11.d\n"
- "fmla z24.s, z30.s, z6.s[1]\n"
- "fmla z23.s, z30.s, z6.s[3]\n"
- "fmla z22.s, z30.s, z7.s[1]\n"
- "fmla z21.s, z30.s, z0.s[1]\n"
- "fmla z20.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z6.s[1]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z23.s, z30.s, z0.s[1]\n"
+ "fmla z22.s, z30.s, z0.s[3]\n"
"mov z0.d, z14.d\n"
- "fmla z19.s, z30.s, z1.s[1]\n"
+ "fmla z21.s, z30.s, z1.s[1]\n"
"mov z1.d, z15.d\n"
- "fmla z18.s, z30.s, z0.s[1]\n"
- "fmla z17.s, z30.s, z0.s[3]\n"
+ "fmla z25.s, z30.s, z6.s[3]\n"
+ "fmla z24.s, z30.s, z7.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[1]\n"
+ "fmla z19.s, z30.s, z0.s[3]\n"
+ "fmla z18.s, z30.s, z1.s[1]\n"
"mov z0.d, z10.d\n"
- "fmla z16.s, z30.s, z1.s[1]\n"
"mov z1.d, z11.d\n"
- "fmla z24.s, z29.s, z6.s[2]\n"
- "fmla z23.s, z29.s, z7.s[0]\n"
- "fmla z22.s, z29.s, z7.s[2]\n"
- "fmla z21.s, z29.s, z0.s[2]\n"
+ "fmla z26.s, z29.s, z6.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[2]\n"
+ "fmin z26.s, p1/M, z26.s, z16.s\n"
+ "fmla z22.s, z29.s, z1.s[0]\n"
+ "fmla z21.s, z29.s, z1.s[2]\n"
"mov z0.d, z14.d\n"
- "fmla z20.s, z29.s, z1.s[0]\n"
- "fmla z19.s, z29.s, z1.s[2]\n"
+ "fmax z26.s, p1/M, z26.s, z17.s\n"
"mov z1.d, z15.d\n"
- "fmla z18.s, z29.s, z0.s[2]\n"
- "fmla z17.s, z29.s, z1.s[0]\n"
- "fmla z16.s, z29.s, z1.s[2]\n"
- "fmin z24.s, p2/M, z24.s, z25.s\n"
- "fmin z23.s, p2/M, z23.s, z25.s\n"
- "fmin z22.s, p2/M, z22.s, z25.s\n"
- "fmin z21.s, p2/M, z21.s, z25.s\n"
- "fmax z24.s, p2/M, z24.s, z26.s\n"
- "st1w { z24.s }, p0, [x12, x25, LSL #2]\n"
- "fmax z23.s, p2/M, z23.s, z26.s\n"
- "fmax z22.s, p2/M, z22.s, z26.s\n"
- "ld1w { z24.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z25.s, z29.s, z7.s[0]\n"
+ "fmla z24.s, z29.s, z7.s[2]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmla z20.s, z29.s, z0.s[2]\n"
+ "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmin z24.s, p1/M, z24.s, z16.s\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "fmla z18.s, z29.s, z1.s[2]\n"
+ "fmin z22.s, p1/M, z22.s, z16.s\n"
+ "fmin z21.s, p1/M, z21.s, z16.s\n"
+ "st1w { z26.s }, p0, [x28, x12, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z19.s, p1/M, z19.s, z16.s\n"
+ "ld1w { z26.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmin z18.s, p1/M, z18.s, z16.s\n"
"addvl %x[params], %x[params], #16\n"
- "fmax z21.s, p2/M, z21.s, z26.s\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "fmin z20.s, p2/M, z20.s, z25.s\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "fmax z25.s, p1/M, z25.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z17.s\n"
+ "st1w { z25.s }, p0, [x27, x12, LSL #2]\n"
+ "mov z25.d, z26.d\n"
+ "fmax z23.s, p1/M, z23.s, z17.s\n"
+ "fmax z22.s, p1/M, z22.s, z17.s\n"
+ "st1w { z24.s }, p0, [x26, x12, LSL #2]\n"
+ "mov z24.d, z26.d\n"
+ "fmax z21.s, p1/M, z21.s, z17.s\n"
+ "fmax z20.s, p1/M, z20.s, z17.s\n"
+ "st1w { z23.s }, p0, [x25, x12, LSL #2]\n"
+ "mov z23.d, z26.d\n"
+ "fmax z19.s, p1/M, z19.s, z17.s\n"
+ "fmax z18.s, p1/M, z18.s, z17.s\n"
+ "st1w { z22.s }, p0, [x24, x12, LSL #2]\n"
+ "mov z22.d, z26.d\n"
+ "st1w { z21.s }, p0, [x23, x12, LSL #2]\n"
+ "mov z21.d, z26.d\n"
"addvl %x[params], %x[params], #-6\n"
- "fmin z19.s, p2/M, z19.s, z25.s\n"
- "st1w { z23.s }, p0, [x11, x25, LSL #2]\n"
- "mov z23.d, z24.d\n"
- "st1w { z22.s }, p0, [x10, x25, LSL #2]\n"
- "mov z22.d, z24.d\n"
- "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
- "mov z21.d, z24.d\n"
- "fmax z20.s, p2/M, z20.s, z26.s\n"
- "st1w { z20.s }, p0, [x27, x25, LSL #2]\n"
- "mov z20.d, z24.d\n"
- "fmax z19.s, p2/M, z19.s, z26.s\n"
- "st1w { z19.s }, p0, [x26, x25, LSL #2]\n"
- "mov z19.d, z24.d\n"
- "fmin z18.s, p2/M, z18.s, z25.s\n"
- "fmin z17.s, p2/M, z17.s, z25.s\n"
- "fmin z16.s, p2/M, z16.s, z25.s\n"
- "fmax z18.s, p2/M, z18.s, z26.s\n"
- "st1w { z18.s }, p0, [x24, x25, LSL #2]\n"
- "mov z18.d, z24.d\n"
- "fmax z17.s, p2/M, z17.s, z26.s\n"
- "st1w { z17.s }, p0, [x23, x25, LSL #2]\n"
- "mov z17.d, z24.d\n"
- "fmax z16.s, p2/M, z16.s, z26.s\n"
- "st1w { z16.s }, p0, [x22, x25, LSL #2]\n"
- "mov z16.d, z24.d\n"
- "incw x25\n"
+ "st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
+ "mov z20.d, z26.d\n"
+ "st1w { z19.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z19.d, z26.d\n"
+ "st1w { z18.s }, p0, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "mov z18.d, z26.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index e7193d625f..2ea116fc9e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,347 +42,347 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
const float minmax_vals[2] = { activation_min, activation_max };
__asm__ __volatile__(
- "ldp x11, x10, [%x[outptrs], #0x0]\n"
- "ptrue p2.b\n"
- "ldp x9, x28, [%x[outptrs], #0x10]\n"
- "mov x27, #0x0\n"
- "ldp x26, x25, [%x[outptrs], #0x20]\n"
- "mov x24, #0x0\n"
- "ldp x23, x22, [%x[outptrs], #0x30]\n"
- "whilelt p1.s, x27, %x[channel_multiplier]\n"
- "ldr x21, [%x[inptrs], #0x0]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ldr x19, [%x[inptrs], #0x10]\n"
- "ld1rqw { z2.s }, p2/Z, [x21]\n"
- "ld1rqw { z3.s }, p2/Z, [x21, #16]\n"
- "ld1rqw { z4.s }, p2/Z, [x20]\n"
- "ld1rqw { z5.s }, p2/Z, [x20, #16]\n"
- "ld1rqw { z6.s }, p2/Z, [x19]\n"
- "ld1rqw { z7.s }, p2/Z, [x19, #16]\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "ldr x19, [%x[inptrs], #0x28]\n"
- "ld1rqw { z8.s }, p2/Z, [x21]\n"
- "ld1rqw { z9.s }, p2/Z, [x21, #16]\n"
- "ld1rqw { z10.s }, p2/Z, [x20]\n"
- "ld1rqw { z11.s }, p2/Z, [x20, #16]\n"
- "ld1rqw { z12.s }, p2/Z, [x19]\n"
- "ld1rqw { z13.s }, p2/Z, [x19, #16]\n"
- "ld1rw { z25.s }, p2/Z, [%x[clamps]]\n"
- "ld1rw { z24.s }, p2/Z, [%x[clamps], #4]\n"
- "ld1w { z23.s }, p1/Z, [%x[params]]\n"
- "mov z22.d, z23.d\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mov z21.d, z23.d\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "mov z20.d, z23.d\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "mov z19.d, z23.d\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #4, MUL VL]\n"
- "mov z18.d, z23.d\n"
- "ld1w { z27.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "ldr x14, [%x[inptrs], #0x0]\n"
+ "ldr x13, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x12, [%x[inptrs], #0x10]\n"
+ "ldr x11, [%x[inptrs], #0x18]\n"
+ "mov x10, #0x0\n"
+ "ldr x9, [%x[inptrs], #0x20]\n"
+ "ldr x28, [%x[inptrs], #0x28]\n"
+ "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov z24.d, z25.d\n"
+ "mov z23.d, z25.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z22.d, z25.d\n"
+ "mov z21.d, z25.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rqw { z2.s }, p1/Z, [x14]\n"
+ "mov z20.d, z25.d\n"
+ "mov z19.d, z25.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x13]\n"
+ "mov z18.d, z25.d\n"
+ "ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x12]\n"
+ "ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x11]\n"
+ "ld1rqw { z9.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x9]\n"
+ "ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x28]\n"
+ "ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
+ "ld1rw { z17.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #6\n"
- "mov z17.d, z23.d\n"
- "mov z16.d, z23.d\n"
"1:" // Output channel complete vector loop
+ "fmla z25.s, z31.s, z2.s[0]\n"
+ "fmla z24.s, z31.s, z2.s[1]\n"
"mov z0.d, z8.d\n"
- "mov p0.b, p1.b\n"
+ "incw x15\n"
+ "fmla z23.s, z31.s, z2.s[2]\n"
+ "fmla z22.s, z31.s, z2.s[3]\n"
"mov z1.d, z9.d\n"
- "incw x27\n"
- "fmla z23.s, z31.s, z2.s[0]\n"
- "whilelt p1.s, x27, %x[channel_multiplier]\n"
- "fmla z22.s, z31.s, z2.s[1]\n"
- "fmla z21.s, z31.s, z2.s[2]\n"
- "fmla z20.s, z31.s, z2.s[3]\n"
- "fmla z19.s, z31.s, z4.s[0]\n"
- "fmla z18.s, z31.s, z4.s[1]\n"
- "fmla z17.s, z31.s, z4.s[2]\n"
- "fmla z16.s, z31.s, z4.s[3]\n"
- "ld1w { z31.s }, p2/Z, [%x[params]]\n"
- "fmla z23.s, z30.s, z2.s[1]\n"
- "fmla z22.s, z30.s, z2.s[2]\n"
- "fmla z21.s, z30.s, z2.s[3]\n"
- "fmla z20.s, z30.s, z3.s[0]\n"
- "fmla z19.s, z30.s, z4.s[1]\n"
- "fmla z18.s, z30.s, z4.s[2]\n"
- "fmla z17.s, z30.s, z4.s[3]\n"
- "fmla z16.s, z30.s, z5.s[0]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "fmla z23.s, z29.s, z2.s[2]\n"
- "fmla z22.s, z29.s, z2.s[3]\n"
- "fmla z21.s, z29.s, z3.s[0]\n"
- "fmla z20.s, z29.s, z3.s[1]\n"
- "fmla z19.s, z29.s, z4.s[2]\n"
- "fmla z18.s, z29.s, z4.s[3]\n"
- "fmla z17.s, z29.s, z5.s[0]\n"
- "fmla z16.s, z29.s, z5.s[1]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "fmla z23.s, z28.s, z2.s[3]\n"
- "fmla z22.s, z28.s, z3.s[0]\n"
- "fmla z21.s, z28.s, z3.s[1]\n"
- "fmla z20.s, z28.s, z3.s[2]\n"
- "fmla z19.s, z28.s, z4.s[3]\n"
- "fmla z18.s, z28.s, z5.s[0]\n"
- "fmla z17.s, z28.s, z5.s[1]\n"
- "fmla z16.s, z28.s, z5.s[2]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "fmla z23.s, z27.s, z3.s[0]\n"
- "fmla z22.s, z27.s, z3.s[1]\n"
- "fmla z21.s, z27.s, z3.s[2]\n"
- "fmla z20.s, z27.s, z3.s[3]\n"
- "fmla z19.s, z27.s, z5.s[0]\n"
- "fmla z18.s, z27.s, z5.s[1]\n"
- "fmla z17.s, z27.s, z5.s[2]\n"
- "fmla z16.s, z27.s, z5.s[3]\n"
- "ld1w { z27.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "fmla z23.s, z31.s, z4.s[0]\n"
- "fmla z22.s, z31.s, z4.s[1]\n"
- "fmla z21.s, z31.s, z4.s[2]\n"
- "fmla z20.s, z31.s, z4.s[3]\n"
- "fmla z19.s, z31.s, z6.s[0]\n"
- "fmla z18.s, z31.s, z6.s[1]\n"
- "fmla z17.s, z31.s, z6.s[2]\n"
- "fmla z16.s, z31.s, z6.s[3]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "fmla z23.s, z30.s, z4.s[1]\n"
- "fmla z22.s, z30.s, z4.s[2]\n"
- "fmla z21.s, z30.s, z4.s[3]\n"
- "fmla z20.s, z30.s, z5.s[0]\n"
- "fmla z19.s, z30.s, z6.s[1]\n"
- "fmla z18.s, z30.s, z6.s[2]\n"
- "fmla z17.s, z30.s, z6.s[3]\n"
- "fmla z16.s, z30.s, z7.s[0]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "fmla z23.s, z29.s, z4.s[2]\n"
- "fmla z22.s, z29.s, z4.s[3]\n"
- "fmla z21.s, z29.s, z5.s[0]\n"
- "fmla z20.s, z29.s, z5.s[1]\n"
- "fmla z19.s, z29.s, z6.s[2]\n"
- "fmla z18.s, z29.s, z6.s[3]\n"
- "fmla z17.s, z29.s, z7.s[0]\n"
- "fmla z16.s, z29.s, z7.s[1]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z21.s, z31.s, z4.s[0]\n"
+ "fmla z20.s, z31.s, z4.s[1]\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "fmla z19.s, z31.s, z4.s[2]\n"
+ "fmla z18.s, z31.s, z4.s[3]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params]]\n"
+ "fmla z25.s, z30.s, z2.s[1]\n"
+ "fmla z24.s, z30.s, z2.s[2]\n"
+ "fmla z23.s, z30.s, z2.s[3]\n"
+ "fmla z22.s, z30.s, z3.s[0]\n"
+ "fmla z21.s, z30.s, z4.s[1]\n"
+ "fmla z20.s, z30.s, z4.s[2]\n"
+ "fmla z19.s, z30.s, z4.s[3]\n"
+ "fmla z18.s, z30.s, z5.s[0]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z25.s, z29.s, z2.s[2]\n"
+ "fmla z24.s, z29.s, z2.s[3]\n"
+ "fmla z23.s, z29.s, z3.s[0]\n"
+ "fmla z22.s, z29.s, z3.s[1]\n"
+ "fmla z21.s, z29.s, z4.s[2]\n"
+ "fmla z20.s, z29.s, z4.s[3]\n"
+ "fmla z19.s, z29.s, z5.s[0]\n"
+ "fmla z18.s, z29.s, z5.s[1]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z25.s, z28.s, z2.s[3]\n"
+ "fmla z24.s, z28.s, z3.s[0]\n"
+ "fmla z23.s, z28.s, z3.s[1]\n"
+ "fmla z22.s, z28.s, z3.s[2]\n"
+ "fmla z21.s, z28.s, z4.s[3]\n"
+ "fmla z20.s, z28.s, z5.s[0]\n"
+ "fmla z19.s, z28.s, z5.s[1]\n"
+ "fmla z18.s, z28.s, z5.s[2]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z25.s, z27.s, z3.s[0]\n"
+ "fmla z24.s, z27.s, z3.s[1]\n"
+ "fmla z23.s, z27.s, z3.s[2]\n"
+ "fmla z22.s, z27.s, z3.s[3]\n"
+ "fmla z21.s, z27.s, z5.s[0]\n"
+ "fmla z20.s, z27.s, z5.s[1]\n"
+ "fmla z19.s, z27.s, z5.s[2]\n"
+ "fmla z18.s, z27.s, z5.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z25.s, z31.s, z4.s[0]\n"
+ "fmla z24.s, z31.s, z4.s[1]\n"
+ "fmla z23.s, z31.s, z4.s[2]\n"
+ "fmla z22.s, z31.s, z4.s[3]\n"
+ "fmla z21.s, z31.s, z6.s[0]\n"
+ "fmla z20.s, z31.s, z6.s[1]\n"
+ "fmla z19.s, z31.s, z6.s[2]\n"
+ "fmla z18.s, z31.s, z6.s[3]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z25.s, z30.s, z4.s[1]\n"
+ "fmla z24.s, z30.s, z4.s[2]\n"
+ "fmla z23.s, z30.s, z4.s[3]\n"
+ "fmla z22.s, z30.s, z5.s[0]\n"
+ "fmla z21.s, z30.s, z6.s[1]\n"
+ "fmla z20.s, z30.s, z6.s[2]\n"
+ "fmla z19.s, z30.s, z6.s[3]\n"
+ "fmla z18.s, z30.s, z7.s[0]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z25.s, z29.s, z4.s[2]\n"
+ "fmla z24.s, z29.s, z4.s[3]\n"
+ "fmla z23.s, z29.s, z5.s[0]\n"
+ "fmla z22.s, z29.s, z5.s[1]\n"
+ "fmla z21.s, z29.s, z6.s[2]\n"
+ "fmla z20.s, z29.s, z6.s[3]\n"
+ "fmla z19.s, z29.s, z7.s[0]\n"
+ "fmla z18.s, z29.s, z7.s[1]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmla z23.s, z28.s, z4.s[3]\n"
- "fmla z22.s, z28.s, z5.s[0]\n"
- "fmla z21.s, z28.s, z5.s[1]\n"
- "fmla z20.s, z28.s, z5.s[2]\n"
- "fmla z19.s, z28.s, z6.s[3]\n"
- "fmla z18.s, z28.s, z7.s[0]\n"
- "fmla z17.s, z28.s, z7.s[1]\n"
- "fmla z16.s, z28.s, z7.s[2]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "fmla z23.s, z27.s, z5.s[0]\n"
- "fmla z22.s, z27.s, z5.s[1]\n"
- "fmla z21.s, z27.s, z5.s[2]\n"
- "fmla z20.s, z27.s, z5.s[3]\n"
- "fmla z19.s, z27.s, z7.s[0]\n"
- "fmla z18.s, z27.s, z7.s[1]\n"
- "fmla z17.s, z27.s, z7.s[2]\n"
- "fmla z16.s, z27.s, z7.s[3]\n"
- "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "fmla z23.s, z31.s, z6.s[0]\n"
- "fmla z22.s, z31.s, z6.s[1]\n"
- "fmla z21.s, z31.s, z6.s[2]\n"
- "fmla z20.s, z31.s, z6.s[3]\n"
- "fmla z19.s, z31.s, z0.s[0]\n"
- "fmla z18.s, z31.s, z0.s[1]\n"
- "fmla z17.s, z31.s, z0.s[2]\n"
- "fmla z16.s, z31.s, z0.s[3]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
- "fmla z23.s, z30.s, z6.s[1]\n"
- "fmla z22.s, z30.s, z6.s[2]\n"
- "fmla z21.s, z30.s, z6.s[3]\n"
- "fmla z20.s, z30.s, z7.s[0]\n"
- "fmla z19.s, z30.s, z0.s[1]\n"
- "fmla z18.s, z30.s, z0.s[2]\n"
- "fmla z17.s, z30.s, z0.s[3]\n"
- "fmla z16.s, z30.s, z1.s[0]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "fmla z23.s, z29.s, z6.s[2]\n"
- "fmla z22.s, z29.s, z6.s[3]\n"
- "fmla z21.s, z29.s, z7.s[0]\n"
- "fmla z20.s, z29.s, z7.s[1]\n"
- "fmla z19.s, z29.s, z0.s[2]\n"
- "fmla z18.s, z29.s, z0.s[3]\n"
- "fmla z17.s, z29.s, z1.s[0]\n"
- "fmla z16.s, z29.s, z1.s[1]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "fmla z23.s, z28.s, z6.s[3]\n"
- "fmla z22.s, z28.s, z7.s[0]\n"
- "fmla z21.s, z28.s, z7.s[1]\n"
- "fmla z20.s, z28.s, z7.s[2]\n"
- "fmla z19.s, z28.s, z0.s[3]\n"
- "fmla z18.s, z28.s, z1.s[0]\n"
- "fmla z17.s, z28.s, z1.s[1]\n"
- "fmla z16.s, z28.s, z1.s[2]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "fmla z23.s, z27.s, z7.s[0]\n"
- "fmla z22.s, z27.s, z7.s[1]\n"
- "fmla z21.s, z27.s, z7.s[2]\n"
- "fmla z20.s, z27.s, z7.s[3]\n"
- "fmla z19.s, z27.s, z1.s[0]\n"
- "fmla z18.s, z27.s, z1.s[1]\n"
- "fmla z17.s, z27.s, z1.s[2]\n"
- "fmla z16.s, z27.s, z1.s[3]\n"
- "ld1w { z27.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z22.s, z31.s, z0.s[1]\n"
- "fmla z21.s, z31.s, z0.s[2]\n"
- "fmla z20.s, z31.s, z0.s[3]\n"
+ "fmla z25.s, z28.s, z4.s[3]\n"
+ "fmla z24.s, z28.s, z5.s[0]\n"
+ "fmla z23.s, z28.s, z5.s[1]\n"
+ "fmla z22.s, z28.s, z5.s[2]\n"
+ "fmla z21.s, z28.s, z6.s[3]\n"
+ "fmla z20.s, z28.s, z7.s[0]\n"
+ "fmla z19.s, z28.s, z7.s[1]\n"
+ "fmla z18.s, z28.s, z7.s[2]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z25.s, z27.s, z5.s[0]\n"
+ "fmla z24.s, z27.s, z5.s[1]\n"
+ "fmla z23.s, z27.s, z5.s[2]\n"
+ "fmla z22.s, z27.s, z5.s[3]\n"
+ "fmla z21.s, z27.s, z7.s[0]\n"
+ "fmla z20.s, z27.s, z7.s[1]\n"
+ "fmla z19.s, z27.s, z7.s[2]\n"
+ "fmla z18.s, z27.s, z7.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "fmla z25.s, z31.s, z6.s[0]\n"
+ "fmla z24.s, z31.s, z6.s[1]\n"
+ "fmla z23.s, z31.s, z6.s[2]\n"
+ "fmla z22.s, z31.s, z6.s[3]\n"
+ "fmla z21.s, z31.s, z0.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[1]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "fmla z18.s, z31.s, z0.s[3]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z25.s, z30.s, z6.s[1]\n"
+ "fmla z24.s, z30.s, z6.s[2]\n"
+ "fmla z23.s, z30.s, z6.s[3]\n"
+ "fmla z22.s, z30.s, z7.s[0]\n"
+ "fmla z21.s, z30.s, z0.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[2]\n"
+ "fmla z19.s, z30.s, z0.s[3]\n"
+ "fmla z18.s, z30.s, z1.s[0]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z25.s, z29.s, z6.s[2]\n"
+ "fmla z24.s, z29.s, z6.s[3]\n"
+ "fmla z23.s, z29.s, z7.s[0]\n"
+ "fmla z22.s, z29.s, z7.s[1]\n"
+ "fmla z21.s, z29.s, z0.s[2]\n"
+ "fmla z20.s, z29.s, z0.s[3]\n"
+ "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z1.s[1]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z25.s, z28.s, z6.s[3]\n"
+ "fmla z24.s, z28.s, z7.s[0]\n"
+ "fmla z23.s, z28.s, z7.s[1]\n"
+ "fmla z22.s, z28.s, z7.s[2]\n"
+ "fmla z21.s, z28.s, z0.s[3]\n"
+ "fmla z20.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z1.s[1]\n"
+ "fmla z18.s, z28.s, z1.s[2]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z25.s, z27.s, z7.s[0]\n"
+ "fmla z24.s, z27.s, z7.s[1]\n"
+ "fmla z23.s, z27.s, z7.s[2]\n"
+ "fmla z22.s, z27.s, z7.s[3]\n"
+ "fmla z21.s, z27.s, z1.s[0]\n"
+ "fmla z20.s, z27.s, z1.s[1]\n"
+ "fmla z19.s, z27.s, z1.s[2]\n"
+ "fmla z18.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z31.s, z0.s[1]\n"
+ "fmla z23.s, z31.s, z0.s[2]\n"
+ "fmla z22.s, z31.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
- "fmla z19.s, z31.s, z0.s[0]\n"
- "fmla z18.s, z31.s, z0.s[1]\n"
- "fmla z17.s, z31.s, z0.s[2]\n"
- "fmla z16.s, z31.s, z0.s[3]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z21.s, z31.s, z0.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[1]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "fmla z18.s, z31.s, z0.s[3]\n"
"mov z0.d, z8.d\n"
- "fmla z23.s, z30.s, z0.s[1]\n"
- "fmla z22.s, z30.s, z0.s[2]\n"
- "fmla z21.s, z30.s, z0.s[3]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z25.s, z30.s, z0.s[1]\n"
+ "fmla z24.s, z30.s, z0.s[2]\n"
+ "fmla z23.s, z30.s, z0.s[3]\n"
+ "fmla z22.s, z30.s, z1.s[0]\n"
"mov z0.d, z10.d\n"
- "fmla z20.s, z30.s, z1.s[0]\n"
"mov z1.d, z11.d\n"
- "fmla z19.s, z30.s, z0.s[1]\n"
- "fmla z18.s, z30.s, z0.s[2]\n"
- "fmla z17.s, z30.s, z0.s[3]\n"
+ "fmla z21.s, z30.s, z0.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[2]\n"
+ "fmla z19.s, z30.s, z0.s[3]\n"
+ "fmla z18.s, z30.s, z1.s[0]\n"
"mov z0.d, z8.d\n"
- "fmla z16.s, z30.s, z1.s[0]\n"
- "ld1w { z30.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params]]\n"
"mov z1.d, z9.d\n"
- "fmla z23.s, z29.s, z0.s[2]\n"
- "fmla z22.s, z29.s, z0.s[3]\n"
+ "fmla z25.s, z29.s, z0.s[2]\n"
+ "fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z23.s, z29.s, z1.s[0]\n"
+ "fmla z22.s, z29.s, z1.s[1]\n"
"mov z0.d, z10.d\n"
- "fmla z21.s, z29.s, z1.s[0]\n"
- "fmla z20.s, z29.s, z1.s[1]\n"
"mov z1.d, z11.d\n"
- "fmla z19.s, z29.s, z0.s[2]\n"
- "fmla z18.s, z29.s, z0.s[3]\n"
+ "fmla z21.s, z29.s, z0.s[2]\n"
+ "fmla z20.s, z29.s, z0.s[3]\n"
+ "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z1.s[1]\n"
"mov z0.d, z8.d\n"
- "fmla z17.s, z29.s, z1.s[0]\n"
- "fmla z16.s, z29.s, z1.s[1]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #1, MUL VL]\n"
"mov z1.d, z9.d\n"
- "fmla z23.s, z28.s, z0.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[3]\n"
+ "fmla z24.s, z28.s, z1.s[0]\n"
+ "fmla z23.s, z28.s, z1.s[1]\n"
+ "fmla z22.s, z28.s, z1.s[2]\n"
"mov z0.d, z10.d\n"
- "fmla z22.s, z28.s, z1.s[0]\n"
- "fmla z21.s, z28.s, z1.s[1]\n"
- "fmla z20.s, z28.s, z1.s[2]\n"
"mov z1.d, z11.d\n"
- "fmla z19.s, z28.s, z0.s[3]\n"
- "fmla z18.s, z28.s, z1.s[0]\n"
- "fmla z17.s, z28.s, z1.s[1]\n"
- "fmla z16.s, z28.s, z1.s[2]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z21.s, z28.s, z0.s[3]\n"
+ "fmla z20.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z1.s[1]\n"
+ "fmla z18.s, z28.s, z1.s[2]\n"
"mov z1.d, z9.d\n"
- "fmla z23.s, z27.s, z1.s[0]\n"
- "fmla z22.s, z27.s, z1.s[1]\n"
- "fmla z21.s, z27.s, z1.s[2]\n"
- "fmla z20.s, z27.s, z1.s[3]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z25.s, z27.s, z1.s[0]\n"
+ "fmla z24.s, z27.s, z1.s[1]\n"
+ "fmla z23.s, z27.s, z1.s[2]\n"
+ "fmla z22.s, z27.s, z1.s[3]\n"
"mov z1.d, z11.d\n"
- "fmla z19.s, z27.s, z1.s[0]\n"
- "fmla z18.s, z27.s, z1.s[1]\n"
- "fmla z17.s, z27.s, z1.s[2]\n"
- "fmla z16.s, z27.s, z1.s[3]\n"
- "ld1w { z27.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z22.s, z31.s, z0.s[1]\n"
- "fmla z21.s, z31.s, z0.s[2]\n"
- "fmla z20.s, z31.s, z0.s[3]\n"
+ "fmla z21.s, z27.s, z1.s[0]\n"
+ "fmla z20.s, z27.s, z1.s[1]\n"
+ "fmla z19.s, z27.s, z1.s[2]\n"
+ "fmla z18.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z31.s, z0.s[1]\n"
+ "fmla z23.s, z31.s, z0.s[2]\n"
+ "fmla z22.s, z31.s, z0.s[3]\n"
"mov z0.d, z12.d\n"
- "fmla z19.s, z31.s, z0.s[0]\n"
- "fmla z18.s, z31.s, z0.s[1]\n"
- "fmla z17.s, z31.s, z0.s[2]\n"
- "fmla z16.s, z31.s, z0.s[3]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z21.s, z31.s, z0.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[1]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "fmla z18.s, z31.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
- "fmla z23.s, z30.s, z0.s[1]\n"
- "fmla z22.s, z30.s, z0.s[2]\n"
- "fmla z21.s, z30.s, z0.s[3]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z25.s, z30.s, z0.s[1]\n"
+ "fmla z24.s, z30.s, z0.s[2]\n"
+ "fmla z23.s, z30.s, z0.s[3]\n"
+ "fmla z22.s, z30.s, z1.s[0]\n"
"mov z0.d, z12.d\n"
- "fmla z20.s, z30.s, z1.s[0]\n"
"mov z1.d, z13.d\n"
- "fmla z19.s, z30.s, z0.s[1]\n"
- "fmla z18.s, z30.s, z0.s[2]\n"
- "fmla z17.s, z30.s, z0.s[3]\n"
+ "fmla z21.s, z30.s, z0.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[2]\n"
+ "fmla z19.s, z30.s, z0.s[3]\n"
+ "fmla z18.s, z30.s, z1.s[0]\n"
"mov z0.d, z10.d\n"
- "fmla z16.s, z30.s, z1.s[0]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
"mov z1.d, z11.d\n"
- "fmla z23.s, z29.s, z0.s[2]\n"
- "fmla z22.s, z29.s, z0.s[3]\n"
+ "fmla z25.s, z29.s, z0.s[2]\n"
+ "fmla z24.s, z29.s, z0.s[3]\n"
+ "fmla z23.s, z29.s, z1.s[0]\n"
+ "fmla z22.s, z29.s, z1.s[1]\n"
"mov z0.d, z12.d\n"
- "fmla z21.s, z29.s, z1.s[0]\n"
- "fmla z20.s, z29.s, z1.s[1]\n"
"mov z1.d, z13.d\n"
- "fmla z19.s, z29.s, z0.s[2]\n"
- "fmla z18.s, z29.s, z0.s[3]\n"
+ "fmla z21.s, z29.s, z0.s[2]\n"
+ "fmla z20.s, z29.s, z0.s[3]\n"
+ "fmla z19.s, z29.s, z1.s[0]\n"
+ "fmla z18.s, z29.s, z1.s[1]\n"
"mov z0.d, z10.d\n"
- "fmla z17.s, z29.s, z1.s[0]\n"
- "fmla z16.s, z29.s, z1.s[1]\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"mov z1.d, z11.d\n"
- "fmla z23.s, z28.s, z0.s[3]\n"
- "mov z0.d, z12.d\n"
- "fmla z22.s, z28.s, z1.s[0]\n"
- "fmla z21.s, z28.s, z1.s[1]\n"
- "fmla z20.s, z28.s, z1.s[2]\n"
+ "fmla z25.s, z28.s, z0.s[3]\n"
+ "fmla z24.s, z28.s, z1.s[0]\n"
+ "fmla z23.s, z28.s, z1.s[1]\n"
+ "fmla z22.s, z28.s, z1.s[2]\n"
"mov z1.d, z13.d\n"
- "fmla z19.s, z28.s, z0.s[3]\n"
- "fmla z18.s, z28.s, z1.s[0]\n"
- "fmla z17.s, z28.s, z1.s[1]\n"
- "fmla z16.s, z28.s, z1.s[2]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z20.s, z28.s, z1.s[0]\n"
+ "fmla z19.s, z28.s, z1.s[1]\n"
+ "fmla z18.s, z28.s, z1.s[2]\n"
"mov z1.d, z11.d\n"
- "fmla z23.s, z27.s, z1.s[0]\n"
- "fmla z22.s, z27.s, z1.s[1]\n"
- "fmla z21.s, z27.s, z1.s[2]\n"
- "fmla z20.s, z27.s, z1.s[3]\n"
+ "fmla z21.s, z28.s, z0.s[3]\n"
+ "fmla z25.s, z27.s, z1.s[0]\n"
+ "fmla z24.s, z27.s, z1.s[1]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z17.s\n"
+ "fmla z23.s, z27.s, z1.s[2]\n"
+ "fmla z22.s, z27.s, z1.s[3]\n"
"mov z1.d, z13.d\n"
- "fmla z19.s, z27.s, z1.s[0]\n"
- "fmla z18.s, z27.s, z1.s[1]\n"
- "fmla z17.s, z27.s, z1.s[2]\n"
- "fmla z16.s, z27.s, z1.s[3]\n"
- "fmin z23.s, p2/M, z23.s, z24.s\n"
- "fmin z22.s, p2/M, z22.s, z24.s\n"
- "fmin z21.s, p2/M, z21.s, z24.s\n"
- "fmin z20.s, p2/M, z20.s, z24.s\n"
- "fmax z23.s, p2/M, z23.s, z25.s\n"
- "st1w { z23.s }, p0, [x11, x24, LSL #2]\n"
- "fmax z22.s, p2/M, z22.s, z25.s\n"
- "fmax z21.s, p2/M, z21.s, z25.s\n"
- "ld1w { z23.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmin z24.s, p1/M, z24.s, z16.s\n"
+ "fmla z21.s, z27.s, z1.s[0]\n"
+ "fmla z20.s, z27.s, z1.s[1]\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "fmin z22.s, p1/M, z22.s, z16.s\n"
+ "fmla z19.s, z27.s, z1.s[2]\n"
+ "fmla z18.s, z27.s, z1.s[3]\n"
+ "fmin z21.s, p1/M, z21.s, z16.s\n"
+ "fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z19.s, p1/M, z19.s, z16.s\n"
+ "fmin z18.s, p1/M, z18.s, z16.s\n"
+ "st1w { z25.s }, p0, [x27, x10, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmax z20.s, p2/M, z20.s, z25.s\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "fmin z19.s, p2/M, z19.s, z24.s\n"
- "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "fmax z24.s, p1/M, z24.s, z17.s\n"
+ "st1w { z24.s }, p0, [x26, x10, LSL #2]\n"
+ "mov z24.d, z25.d\n"
+ "fmax z23.s, p1/M, z23.s, z17.s\n"
+ "fmax z22.s, p1/M, z22.s, z17.s\n"
+ "st1w { z23.s }, p0, [x25, x10, LSL #2]\n"
+ "mov z23.d, z25.d\n"
+ "fmax z21.s, p1/M, z21.s, z17.s\n"
+ "fmax z20.s, p1/M, z20.s, z17.s\n"
+ "st1w { z22.s }, p0, [x24, x10, LSL #2]\n"
+ "mov z22.d, z25.d\n"
+ "fmax z19.s, p1/M, z19.s, z17.s\n"
+ "fmax z18.s, p1/M, z18.s, z17.s\n"
+ "st1w { z21.s }, p0, [x23, x10, LSL #2]\n"
+ "mov z21.d, z25.d\n"
+ "st1w { z20.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z20.d, z25.d\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "st1w { z19.s }, p0, [x21, x10, LSL #2]\n"
+ "mov z19.d, z25.d\n"
"addvl %x[params], %x[params], #-6\n"
- "fmin z18.s, p2/M, z18.s, z24.s\n"
- "st1w { z22.s }, p0, [x10, x24, LSL #2]\n"
- "mov z22.d, z23.d\n"
- "st1w { z21.s }, p0, [x9, x24, LSL #2]\n"
- "mov z21.d, z23.d\n"
- "st1w { z20.s }, p0, [x28, x24, LSL #2]\n"
- "mov z20.d, z23.d\n"
- "fmax z19.s, p2/M, z19.s, z25.s\n"
- "st1w { z19.s }, p0, [x26, x24, LSL #2]\n"
- "mov z19.d, z23.d\n"
- "fmax z18.s, p2/M, z18.s, z25.s\n"
- "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
- "mov z18.d, z23.d\n"
- "fmin z17.s, p2/M, z17.s, z24.s\n"
- "fmin z16.s, p2/M, z16.s, z24.s\n"
- "fmax z17.s, p2/M, z17.s, z25.s\n"
- "st1w { z17.s }, p0, [x23, x24, LSL #2]\n"
- "mov z17.d, z23.d\n"
- "fmax z16.s, p2/M, z16.s, z25.s\n"
- "st1w { z16.s }, p0, [x22, x24, LSL #2]\n"
- "mov z16.d, z23.d\n"
- "incw x24\n"
+ "st1w { z18.s }, p0, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "mov z18.d, z25.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index b23cec8593..a43b81d7e8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,406 +45,406 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
__asm__ __volatile__(
"ptrue p1.b\n"
- "ld1rw { z11.s }, p1/Z, [%x[minmax_vals]]\n"
- "mov x28, #0x0\n"
- "ld1rw { z10.s }, p1/Z, [%x[minmax_vals], #4]\n"
- "whilelt p0.s, x28, %x[n_output_channels]\n"
+ "mov x9, #0x0\n"
+ "ld1rw { z10.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z13.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
"1:" // Output channel loop
- "mov z16.b, #0x0\n"
+ "mov z5.b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ld1w { z16.s }, p0/Z, [%x[bias], x28, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
"2:" // Output channel loop: Load bias: Done
- "mov z9.d, z16.d\n"
+ "mov x21, %x[inptrs]\n"
+ "ldp x24, x28, [x21], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "mov z16.d, z5.d\n"
+ "mov z17.d, z5.d\n"
+ "mov z18.d, z5.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x24]\n"
+ "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+ "mov z19.d, z5.d\n"
+ "mov z20.d, z5.d\n"
+ "ld1rqw { z0.s }, p1/Z, [x28]\n"
+ "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
+ "mov z21.d, z5.d\n"
+ "mov z22.d, z5.d\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "mov x20, %x[inptrs]\n"
- "mov z31.d, z16.d\n"
- "ldp x24, x27, [x20], #0x10\n"
- "lsr x19, %x[kernel_points], #0x1\n"
- "mov z30.d, z16.d\n"
- "ld1rqw { z7.s }, p1/Z, [x24]\n"
- "mov z29.d, z16.d\n"
"addvl %x[weights], %x[weights], #1\n"
- "mov z28.d, z16.d\n"
- "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "mov z27.d, z16.d\n"
- "ld1rqw { z5.s }, p1/Z, [x27]\n"
- "mov z26.d, z16.d\n"
- "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
- "mov z25.d, z16.d\n"
- "mov z24.d, z16.d\n"
- "mov z23.d, z16.d\n"
- "mov z22.d, z16.d\n"
- "mov z21.d, z16.d\n"
- "mov z20.d, z16.d\n"
- "mov z19.d, z16.d\n"
- "mov z18.d, z16.d\n"
- "mov z17.d, z16.d\n"
- "cbz x19, 6f\n"
- "ldp x24, x27, [x20], #0x10\n"
- "ld1w { z16.s }, p1/Z, [%x[weights]]\n"
- "subs x19, x19, #0x1\n"
+ "mov z23.d, z5.d\n"
+ "mov z24.d, z5.d\n"
+ "mov z25.d, z5.d\n"
+ "mov z26.d, z5.d\n"
+ "mov z27.d, z5.d\n"
+ "mov z28.d, z5.d\n"
+ "mov z29.d, z5.d\n"
+ "mov z30.d, z5.d\n"
+ "mov z31.d, z5.d\n"
+ "cbz x20, 6f\n"
+ "ldp x24, x28, [x21], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "ld1rqw { z5.s }, p1/Z, [x24]\n"
+ "ld1rqw { z7.s }, p1/Z, [x24, #16]\n"
+ "ld1rqw { z3.s }, p1/Z, [x28]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights]]\n"
"addvl %x[weights], %x[weights], #1\n"
- "ld1rqw { z3.s }, p1/Z, [x24]\n"
- "ld1rqw { z2.s }, p1/Z, [x24, #16]\n"
- "ld1rqw { z1.s }, p1/Z, [x27]\n"
- "ld1rqw { z0.s }, p1/Z, [x27, #16]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
- "fmla z9.s, z8.s, z7.s[0]\n"
- "ldp x24, x27, [x20], #0x10\n"
- "subs x19, x19, #0x1\n"
- "fmla z31.s, z8.s, z7.s[1]\n"
- "fmla z30.s, z8.s, z7.s[2]\n"
- "fmla z29.s, z8.s, z7.s[3]\n"
- "ld1rqw { z7.s }, p1/Z, [x24]\n"
- "fmla z28.s, z8.s, z6.s[0]\n"
- "fmla z27.s, z8.s, z6.s[1]\n"
- "fmla z26.s, z8.s, z6.s[2]\n"
- "fmla z25.s, z8.s, z6.s[3]\n"
+ "ldp x24, x28, [x21], #0x10\n"
+ "fmla z16.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z8.s, z1.s[1]\n"
+ "subs x20, x20, #0x1\n"
+ "fmla z18.s, z8.s, z1.s[2]\n"
+ "fmla z19.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x24]\n"
+ "fmla z20.s, z8.s, z6.s[0]\n"
+ "fmla z21.s, z8.s, z6.s[1]\n"
+ "fmla z22.s, z8.s, z6.s[2]\n"
+ "fmla z23.s, z8.s, z6.s[3]\n"
"ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z8.s, z5.s[0]\n"
- "fmla z23.s, z8.s, z5.s[1]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z21.s, z8.s, z5.s[3]\n"
- "ld1rqw { z5.s }, p1/Z, [x27]\n"
- "fmla z20.s, z8.s, z4.s[0]\n"
- "fmla z19.s, z8.s, z4.s[1]\n"
- "fmla z18.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z8.s, z4.s[3]\n"
- "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
- "fmla z9.s, z16.s, z3.s[0]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x28]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
+ "ldp x24, x28, [x21], #0x10\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "fmla z31.s, z16.s, z3.s[1]\n"
- "ldp x24, x27, [x20], #0x10\n"
- "fmla z30.s, z16.s, z3.s[2]\n"
- "fmla z29.s, z16.s, z3.s[3]\n"
- "ld1rqw { z3.s }, p1/Z, [x24]\n"
- "fmla z28.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z2.s[1]\n"
- "fmla z26.s, z16.s, z2.s[2]\n"
- "fmla z25.s, z16.s, z2.s[3]\n"
- "ld1rqw { z2.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z16.s, z1.s[0]\n"
- "fmla z23.s, z16.s, z1.s[1]\n"
- "fmla z22.s, z16.s, z1.s[2]\n"
- "fmla z21.s, z16.s, z1.s[3]\n"
- "ld1rqw { z1.s }, p1/Z, [x27]\n"
- "fmla z20.s, z16.s, z0.s[0]\n"
- "fmla z19.s, z16.s, z0.s[1]\n"
- "fmla z18.s, z16.s, z0.s[2]\n"
- "fmla z17.s, z16.s, z0.s[3]\n"
- "ld1rqw { z0.s }, p1/Z, [x27, #16]\n"
- "ld1w { z16.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+ "fmla z16.s, z11.s, z5.s[0]\n"
+ "fmla z17.s, z11.s, z5.s[1]\n"
+ "fmla z18.s, z11.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x24]\n"
+ "fmla z20.s, z11.s, z7.s[0]\n"
+ "fmla z21.s, z11.s, z7.s[1]\n"
+ "fmla z22.s, z11.s, z7.s[2]\n"
+ "fmla z23.s, z11.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x24, #16]\n"
+ "fmla z24.s, z11.s, z3.s[0]\n"
+ "fmla z25.s, z11.s, z3.s[1]\n"
+ "fmla z26.s, z11.s, z3.s[2]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x28]\n"
+ "fmla z28.s, z11.s, z2.s[0]\n"
+ "fmla z29.s, z11.s, z2.s[1]\n"
+ "fmla z30.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
"addvl %x[weights], %x[weights], #2\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
- "fmla z9.s, z8.s, z7.s[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla z31.s, z8.s, z7.s[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla z30.s, z8.s, z7.s[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla z29.s, z8.s, z7.s[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla z28.s, z8.s, z6.s[0]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla z27.s, z8.s, z6.s[1]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla z26.s, z8.s, z6.s[2]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla z25.s, z8.s, z6.s[3]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla z24.s, z8.s, z5.s[0]\n"
- "fmla z23.s, z8.s, z5.s[1]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z21.s, z8.s, z5.s[3]\n"
- "fmla z20.s, z8.s, z4.s[0]\n"
- "fmla z19.s, z8.s, z4.s[1]\n"
- "fmla z18.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z8.s, z4.s[3]\n"
- "fmla z9.s, z16.s, z3.s[0]\n"
- "fmla z31.s, z16.s, z3.s[1]\n"
- "fmla z30.s, z16.s, z3.s[2]\n"
- "fmla z29.s, z16.s, z3.s[3]\n"
- "fmla z28.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z2.s[1]\n"
- "fmla z26.s, z16.s, z2.s[2]\n"
- "fmla z25.s, z16.s, z2.s[3]\n"
- "fmla z24.s, z16.s, z1.s[0]\n"
- "fmla z23.s, z16.s, z1.s[1]\n"
- "fmla z22.s, z16.s, z1.s[2]\n"
- "fmla z21.s, z16.s, z1.s[3]\n"
- "fmla z20.s, z16.s, z0.s[0]\n"
- "fmla z19.s, z16.s, z0.s[1]\n"
- "fmla z18.s, z16.s, z0.s[2]\n"
- "fmla z17.s, z16.s, z0.s[3]\n"
- "fmin z9.s, p1/M, z9.s, z10.s\n"
- "fmin z31.s, p1/M, z31.s, z10.s\n"
- "fmin z30.s, p1/M, z30.s, z10.s\n"
- "fmin z29.s, p1/M, z29.s, z10.s\n"
- "fmax z9.s, p1/M, z9.s, z11.s\n"
- "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
- "fmax z31.s, p1/M, z31.s, z11.s\n"
- "fmax z30.s, p1/M, z30.s, z11.s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmax z29.s, p1/M, z29.s, z11.s\n"
- "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
- "fmin z28.s, p1/M, z28.s, z10.s\n"
- "fmin z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z26.s, p1/M, z26.s, z10.s\n"
- "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
- "fmin z25.s, p1/M, z25.s, z10.s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin z24.s, p1/M, z24.s, z10.s\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax z28.s, p1/M, z28.s, z11.s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax z27.s, p1/M, z27.s, z11.s\n"
- "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
- "fmax z26.s, p1/M, z26.s, z11.s\n"
- "fmax z25.s, p1/M, z25.s, z11.s\n"
- "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
- "fmax z24.s, p1/M, z24.s, z11.s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmin z23.s, p1/M, z23.s, z10.s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmin z22.s, p1/M, z22.s, z10.s\n"
- "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
- "fmin z21.s, p1/M, z21.s, z10.s\n"
- "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
- "fmin z20.s, p1/M, z20.s, z10.s\n"
- "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
- "fmax z23.s, p1/M, z23.s, z11.s\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax z22.s, p1/M, z22.s, z11.s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax z21.s, p1/M, z21.s, z11.s\n"
- "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
- "fmax z20.s, p1/M, z20.s, z11.s\n"
- "fmin z19.s, p1/M, z19.s, z10.s\n"
- "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z18.s, p1/M, z18.s, z10.s\n"
- "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
- "fmin z17.s, p1/M, z17.s, z10.s\n"
- "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
- "fmax z19.s, p1/M, z19.s, z11.s\n"
- "fmax z18.s, p1/M, z18.s, z11.s\n"
- "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
- "fmax z17.s, p1/M, z17.s, z11.s\n"
- "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
- "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+ "fmla z16.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z8.s, z1.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "fmla z18.s, z8.s, z1.s[2]\n"
+ "fmla z19.s, z8.s, z1.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla z20.s, z8.s, z6.s[0]\n"
+ "fmla z21.s, z8.s, z6.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "fmla z22.s, z8.s, z6.s[2]\n"
+ "fmla z23.s, z8.s, z6.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "fmla z16.s, z11.s, z5.s[0]\n"
+ "fmla z17.s, z11.s, z5.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z13.s\n"
+ "fmin z17.s, p1/M, z17.s, z13.s\n"
+ "fmla z18.s, z11.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z5.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z13.s\n"
+ "fmin z19.s, p1/M, z19.s, z13.s\n"
+ "fmla z20.s, z11.s, z7.s[0]\n"
+ "fmla z21.s, z11.s, z7.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z13.s\n"
+ "fmin z21.s, p1/M, z21.s, z13.s\n"
+ "fmla z22.s, z11.s, z7.s[2]\n"
+ "fmla z23.s, z11.s, z7.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z13.s\n"
+ "fmin z23.s, p1/M, z23.s, z13.s\n"
+ "fmla z24.s, z11.s, z3.s[0]\n"
+ "fmla z25.s, z11.s, z3.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z10.s\n"
+ "fmax z17.s, p1/M, z17.s, z10.s\n"
+ "fmla z26.s, z11.s, z3.s[2]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z10.s\n"
+ "fmax z19.s, p1/M, z19.s, z10.s\n"
+ "fmla z28.s, z11.s, z2.s[0]\n"
+ "fmla z29.s, z11.s, z2.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z10.s\n"
+ "fmax z21.s, p1/M, z21.s, z10.s\n"
+ "fmla z30.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z2.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z10.s\n"
+ "fmax z23.s, p1/M, z23.s, z10.s\n"
+ "fmin z24.s, p1/M, z24.s, z13.s\n"
+ "fmin z25.s, p1/M, z25.s, z13.s\n"
+ "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z13.s\n"
+ "fmin z27.s, p1/M, z27.s, z13.s\n"
+ "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z13.s\n"
+ "fmin z29.s, p1/M, z29.s, z13.s\n"
+ "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z13.s\n"
+ "fmin z31.s, p1/M, z31.s, z13.s\n"
+ "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z10.s\n"
+ "fmax z25.s, p1/M, z25.s, z10.s\n"
+ "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z10.s\n"
+ "fmax z27.s, p1/M, z27.s, z10.s\n"
+ "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z10.s\n"
+ "fmax z29.s, p1/M, z29.s, z10.s\n"
+ "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z10.s\n"
+ "fmax z31.s, p1/M, z31.s, z10.s\n"
+ "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla z9.s, z8.s, z7.s[0]\n"
- "ldp x24, x27, [x20], #0x10\n"
- "fmla z31.s, z8.s, z7.s[1]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla z30.s, z8.s, z7.s[2]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla z29.s, z8.s, z7.s[3]\n"
- "ld1rqw { z7.s }, p1/Z, [x24]\n"
- "fmla z28.s, z8.s, z6.s[0]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla z27.s, z8.s, z6.s[1]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla z26.s, z8.s, z6.s[2]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla z25.s, z8.s, z6.s[3]\n"
+ "fmla z16.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z8.s, z1.s[1]\n"
+ "ldp x24, x28, [x21], #0x10\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "fmla z18.s, z8.s, z1.s[2]\n"
+ "fmla z19.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x24]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "fmla z20.s, z8.s, z6.s[0]\n"
+ "fmla z21.s, z8.s, z6.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla z22.s, z8.s, z6.s[2]\n"
+ "fmla z23.s, z8.s, z6.s[3]\n"
"ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
- "fmla z24.s, z8.s, z5.s[0]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla z23.s, z8.s, z5.s[1]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla z21.s, z8.s, z5.s[3]\n"
- "ld1rqw { z5.s }, p1/Z, [x27]\n"
- "fmla z20.s, z8.s, z4.s[0]\n"
- "fmla z19.s, z8.s, z4.s[1]\n"
- "fmla z18.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z8.s, z4.s[3]\n"
- "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
- "fmla z9.s, z16.s, z3.s[0]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x28]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
"ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "ld1rqw { z4.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z5.s[0]\n"
+ "fmla z17.s, z11.s, z5.s[1]\n"
"addvl %x[weights], %x[weights], #1\n"
- "fmla z31.s, z16.s, z3.s[1]\n"
- "fmla z30.s, z16.s, z3.s[2]\n"
- "fmla z29.s, z16.s, z3.s[3]\n"
- "fmla z28.s, z16.s, z2.s[0]\n"
- "fmla z27.s, z16.s, z2.s[1]\n"
- "fmla z26.s, z16.s, z2.s[2]\n"
- "fmla z25.s, z16.s, z2.s[3]\n"
- "fmla z24.s, z16.s, z1.s[0]\n"
- "fmla z23.s, z16.s, z1.s[1]\n"
- "fmla z22.s, z16.s, z1.s[2]\n"
- "fmla z21.s, z16.s, z1.s[3]\n"
- "fmla z20.s, z16.s, z0.s[0]\n"
- "fmla z19.s, z16.s, z0.s[1]\n"
- "fmla z18.s, z16.s, z0.s[2]\n"
- "fmla z17.s, z16.s, z0.s[3]\n"
- "fmla z9.s, z8.s, z7.s[0]\n"
- "fmla z31.s, z8.s, z7.s[1]\n"
- "fmla z30.s, z8.s, z7.s[2]\n"
- "fmla z29.s, z8.s, z7.s[3]\n"
- "fmla z28.s, z8.s, z6.s[0]\n"
- "fmla z27.s, z8.s, z6.s[1]\n"
- "fmla z26.s, z8.s, z6.s[2]\n"
- "fmla z25.s, z8.s, z6.s[3]\n"
- "fmla z24.s, z8.s, z5.s[0]\n"
- "fmla z23.s, z8.s, z5.s[1]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z21.s, z8.s, z5.s[3]\n"
- "fmla z20.s, z8.s, z4.s[0]\n"
- "fmla z19.s, z8.s, z4.s[1]\n"
- "fmla z18.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z8.s, z4.s[3]\n"
- "fmin z9.s, p1/M, z9.s, z10.s\n"
- "fmin z31.s, p1/M, z31.s, z10.s\n"
- "fmin z30.s, p1/M, z30.s, z10.s\n"
- "fmin z29.s, p1/M, z29.s, z10.s\n"
- "fmax z9.s, p1/M, z9.s, z11.s\n"
- "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
- "fmax z31.s, p1/M, z31.s, z11.s\n"
- "fmax z30.s, p1/M, z30.s, z11.s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmax z29.s, p1/M, z29.s, z11.s\n"
- "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
- "fmin z28.s, p1/M, z28.s, z10.s\n"
- "fmin z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z26.s, p1/M, z26.s, z10.s\n"
- "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
- "fmin z25.s, p1/M, z25.s, z10.s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin z24.s, p1/M, z24.s, z10.s\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax z28.s, p1/M, z28.s, z11.s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax z27.s, p1/M, z27.s, z11.s\n"
- "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
- "fmax z26.s, p1/M, z26.s, z11.s\n"
- "fmax z25.s, p1/M, z25.s, z11.s\n"
- "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
- "fmax z24.s, p1/M, z24.s, z11.s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmin z23.s, p1/M, z23.s, z10.s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmin z22.s, p1/M, z22.s, z10.s\n"
- "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
- "fmin z21.s, p1/M, z21.s, z10.s\n"
- "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
- "fmin z20.s, p1/M, z20.s, z10.s\n"
- "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
- "fmax z23.s, p1/M, z23.s, z11.s\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax z22.s, p1/M, z22.s, z11.s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax z21.s, p1/M, z21.s, z11.s\n"
- "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
- "fmax z20.s, p1/M, z20.s, z11.s\n"
- "fmin z19.s, p1/M, z19.s, z10.s\n"
- "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z18.s, p1/M, z18.s, z10.s\n"
- "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
- "fmin z17.s, p1/M, z17.s, z10.s\n"
- "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
- "fmax z19.s, p1/M, z19.s, z11.s\n"
- "fmax z18.s, p1/M, z18.s, z11.s\n"
- "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
- "fmax z17.s, p1/M, z17.s, z11.s\n"
- "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
- "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+ "fmla z18.s, z11.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z5.s[3]\n"
+ "fmla z20.s, z11.s, z7.s[0]\n"
+ "fmla z21.s, z11.s, z7.s[1]\n"
+ "fmla z22.s, z11.s, z7.s[2]\n"
+ "fmla z23.s, z11.s, z7.s[3]\n"
+ "fmla z24.s, z11.s, z3.s[0]\n"
+ "fmla z25.s, z11.s, z3.s[1]\n"
+ "fmla z26.s, z11.s, z3.s[2]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z2.s[0]\n"
+ "fmla z29.s, z11.s, z2.s[1]\n"
+ "fmla z30.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z2.s[3]\n"
+ "fmla z16.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z8.s, z1.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z13.s\n"
+ "fmin z17.s, p1/M, z17.s, z13.s\n"
+ "fmla z18.s, z8.s, z1.s[2]\n"
+ "fmla z19.s, z8.s, z1.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z13.s\n"
+ "fmin z19.s, p1/M, z19.s, z13.s\n"
+ "fmla z20.s, z8.s, z6.s[0]\n"
+ "fmla z21.s, z8.s, z6.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z13.s\n"
+ "fmin z21.s, p1/M, z21.s, z13.s\n"
+ "fmla z22.s, z8.s, z6.s[2]\n"
+ "fmla z23.s, z8.s, z6.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z13.s\n"
+ "fmin z23.s, p1/M, z23.s, z13.s\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z10.s\n"
+ "fmax z17.s, p1/M, z17.s, z10.s\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z10.s\n"
+ "fmax z19.s, p1/M, z19.s, z10.s\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z10.s\n"
+ "fmax z21.s, p1/M, z21.s, z10.s\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z10.s\n"
+ "fmax z23.s, p1/M, z23.s, z10.s\n"
+ "fmin z24.s, p1/M, z24.s, z13.s\n"
+ "fmin z25.s, p1/M, z25.s, z13.s\n"
+ "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z13.s\n"
+ "fmin z27.s, p1/M, z27.s, z13.s\n"
+ "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z13.s\n"
+ "fmin z29.s, p1/M, z29.s, z13.s\n"
+ "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z13.s\n"
+ "fmin z31.s, p1/M, z31.s, z13.s\n"
+ "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z10.s\n"
+ "fmax z25.s, p1/M, z25.s, z10.s\n"
+ "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z10.s\n"
+ "fmax z27.s, p1/M, z27.s, z10.s\n"
+ "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z10.s\n"
+ "fmax z29.s, p1/M, z29.s, z10.s\n"
+ "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z10.s\n"
+ "fmax z31.s, p1/M, z31.s, z10.s\n"
+ "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla z9.s, z8.s, z7.s[0]\n"
- "ldr x19, [%x[outptrs], #0x0]\n"
- "fmla z31.s, z8.s, z7.s[1]\n"
- "ldr x20, [%x[outptrs], #0x8]\n"
- "fmla z30.s, z8.s, z7.s[2]\n"
- "ldr x21, [%x[outptrs], #0x10]\n"
- "fmla z29.s, z8.s, z7.s[3]\n"
- "ldr x22, [%x[outptrs], #0x18]\n"
- "fmla z28.s, z8.s, z6.s[0]\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "fmla z27.s, z8.s, z6.s[1]\n"
- "ldr x24, [%x[outptrs], #0x28]\n"
- "fmla z26.s, z8.s, z6.s[2]\n"
- "ldr x25, [%x[outptrs], #0x30]\n"
- "fmla z25.s, z8.s, z6.s[3]\n"
- "ldr x26, [%x[outptrs], #0x38]\n"
- "fmla z24.s, z8.s, z5.s[0]\n"
- "fmla z23.s, z8.s, z5.s[1]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z21.s, z8.s, z5.s[3]\n"
- "fmla z20.s, z8.s, z4.s[0]\n"
- "fmla z19.s, z8.s, z4.s[1]\n"
- "fmla z18.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z8.s, z4.s[3]\n"
- "fmin z9.s, p1/M, z9.s, z10.s\n"
- "fmin z31.s, p1/M, z31.s, z10.s\n"
- "fmin z30.s, p1/M, z30.s, z10.s\n"
- "fmin z29.s, p1/M, z29.s, z10.s\n"
- "fmax z9.s, p1/M, z9.s, z11.s\n"
- "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
- "fmax z31.s, p1/M, z31.s, z11.s\n"
- "fmax z30.s, p1/M, z30.s, z11.s\n"
- "ldr x19, [%x[outptrs], #0x40]\n"
- "fmax z29.s, p1/M, z29.s, z11.s\n"
- "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
- "fmin z28.s, p1/M, z28.s, z10.s\n"
- "fmin z27.s, p1/M, z27.s, z10.s\n"
- "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z26.s, p1/M, z26.s, z10.s\n"
- "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
- "fmin z25.s, p1/M, z25.s, z10.s\n"
- "ldr x20, [%x[outptrs], #0x48]\n"
- "fmin z24.s, p1/M, z24.s, z10.s\n"
- "ldr x21, [%x[outptrs], #0x50]\n"
- "fmax z28.s, p1/M, z28.s, z11.s\n"
- "ldr x22, [%x[outptrs], #0x58]\n"
- "fmax z27.s, p1/M, z27.s, z11.s\n"
- "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
- "fmax z26.s, p1/M, z26.s, z11.s\n"
- "fmax z25.s, p1/M, z25.s, z11.s\n"
- "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
- "fmax z24.s, p1/M, z24.s, z11.s\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
- "fmin z23.s, p1/M, z23.s, z10.s\n"
- "ldr x24, [%x[outptrs], #0x68]\n"
- "fmin z22.s, p1/M, z22.s, z10.s\n"
- "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
- "fmin z21.s, p1/M, z21.s, z10.s\n"
- "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
- "fmin z20.s, p1/M, z20.s, z10.s\n"
- "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
- "fmax z23.s, p1/M, z23.s, z11.s\n"
- "ldr x25, [%x[outptrs], #0x70]\n"
- "fmax z22.s, p1/M, z22.s, z11.s\n"
- "ldr x26, [%x[outptrs], #0x78]\n"
- "fmax z21.s, p1/M, z21.s, z11.s\n"
- "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
- "fmax z20.s, p1/M, z20.s, z11.s\n"
- "fmin z19.s, p1/M, z19.s, z10.s\n"
- "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
- "fmin z18.s, p1/M, z18.s, z10.s\n"
- "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
- "fmin z17.s, p1/M, z17.s, z10.s\n"
- "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
- "fmax z19.s, p1/M, z19.s, z11.s\n"
- "fmax z18.s, p1/M, z18.s, z11.s\n"
- "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
- "fmax z17.s, p1/M, z17.s, z11.s\n"
- "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
- "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+ "fmla z16.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z8.s, z1.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z13.s\n"
+ "fmin z17.s, p1/M, z17.s, z13.s\n"
+ "fmla z18.s, z8.s, z1.s[2]\n"
+ "fmla z19.s, z8.s, z1.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z13.s\n"
+ "fmin z19.s, p1/M, z19.s, z13.s\n"
+ "fmla z20.s, z8.s, z6.s[0]\n"
+ "fmla z21.s, z8.s, z6.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z13.s\n"
+ "fmin z21.s, p1/M, z21.s, z13.s\n"
+ "fmla z22.s, z8.s, z6.s[2]\n"
+ "fmla z23.s, z8.s, z6.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z13.s\n"
+ "fmin z23.s, p1/M, z23.s, z13.s\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x8]\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x10]\n"
+ "ldr x23, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x20]\n"
+ "ldr x25, [%x[outptrs], #0x28]\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x30]\n"
+ "ldr x27, [%x[outptrs], #0x38]\n"
+ "fmax z16.s, p1/M, z16.s, z10.s\n"
+ "fmax z17.s, p1/M, z17.s, z10.s\n"
+ "st1w { z16.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax z18.s, p1/M, z18.s, z10.s\n"
+ "fmax z19.s, p1/M, z19.s, z10.s\n"
+ "st1w { z17.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x48]\n"
+ "fmax z20.s, p1/M, z20.s, z10.s\n"
+ "fmax z21.s, p1/M, z21.s, z10.s\n"
+ "st1w { z18.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x50]\n"
+ "fmax z22.s, p1/M, z22.s, z10.s\n"
+ "fmax z23.s, p1/M, z23.s, z10.s\n"
+ "st1w { z19.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x58]\n"
+ "fmin z24.s, p1/M, z24.s, z13.s\n"
+ "fmin z25.s, p1/M, z25.s, z13.s\n"
+ "st1w { z20.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x60]\n"
+ "fmin z26.s, p1/M, z26.s, z13.s\n"
+ "fmin z27.s, p1/M, z27.s, z13.s\n"
+ "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x68]\n"
+ "fmin z28.s, p1/M, z28.s, z13.s\n"
+ "fmin z29.s, p1/M, z29.s, z13.s\n"
+ "st1w { z22.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x70]\n"
+ "fmin z30.s, p1/M, z30.s, z13.s\n"
+ "fmin z31.s, p1/M, z31.s, z13.s\n"
+ "st1w { z23.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x78]\n"
+ "fmax z24.s, p1/M, z24.s, z10.s\n"
+ "fmax z25.s, p1/M, z25.s, z10.s\n"
+ "st1w { z24.s }, p0, [x20, x9, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z10.s\n"
+ "fmax z27.s, p1/M, z27.s, z10.s\n"
+ "st1w { z25.s }, p0, [x21, x9, LSL #2]\n"
+ "fmax z28.s, p1/M, z28.s, z10.s\n"
+ "fmax z29.s, p1/M, z29.s, z10.s\n"
+ "st1w { z26.s }, p0, [x22, x9, LSL #2]\n"
+ "fmax z30.s, p1/M, z30.s, z10.s\n"
+ "fmax z31.s, p1/M, z31.s, z10.s\n"
+ "st1w { z27.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x27, x9, LSL #2]\n"
"7:" // Output channel loop: Done
- "incw x28\n"
- "whilelt p0.s, x28, %x[n_output_channels]\n"
+ "incw x9\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
"b.any 1b\n"
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 800803770a..4eae5961a0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,421 +41,461 @@ void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ldp x11, x10, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "ldp x9, x28, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "ldp x27, x26, [%x[inptrs], #0x20]\n"
- "mov x19, #0x1\n"
- "ldp x25, x24, [%x[inptrs], #0x30]\n"
- "orr x19, x19, #0x100\n"
- "ldp x23, x22, [%x[outptrs], #0x0]\n"
- "orr x19, x19, #0x10000\n"
- "dup z12.s, w19\n"
- "ldp x21, x20, [%x[outptrs], #0x10]\n"
- "mov x19, #0x0\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "whilelt p1.b, x19, %x[n_channels]\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "mov x13, #0x0\n"
+ "whilelt p2.b, x13, %x[n_channels]\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "mov x20, #0x1\n"
+ "ptrue p1.b\n"
+ "ldp x24, x23, [%x[outptrs], #0x0]\n"
+ "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "orr x20, x20, #0x100\n"
+ "orr x20, x20, #0x10000\n"
+ "ld1b { z14.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z13.b }, p2/Z, [x11, x13]\n"
+ "dup z12.s, w20\n"
+ "mov x20, #0x0\n"
+ "ldp x12, x11, [%x[inptrs], #0x40]\n"
+ "ld1b { z11.b }, p2/Z, [x10, x13]\n"
+ "zip2 z10.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "ld1b { z9.b }, p2/Z, [x9, x13]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "zip1 z11.b, z13.b, z9.b\n"
+ "zip2 z9.b, z13.b, z9.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z7.b }, p2/Z, [x27, x13]\n"
+ "zip2 z13.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ld1b { z6.b }, p2/Z, [x26, x13]\n"
+ "zip1 z11.b, z10.b, z9.b\n"
+ "zip2 z9.b, z10.b, z9.b\n"
+ "ld1b { z5.b }, p2/Z, [x25, x13]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "zip2 z4.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "ld1b { z3.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z2.b }, p2/Z, [x11, x13]\n"
+ "zip1 z6.b, z7.b, z5.b\n"
+ "zip2 z5.b, z7.b, z5.b\n"
+ "ld1b { z1.b }, p2/Z, [x10, x13]\n"
+ "ld1b { z0.b }, p2/Z, [x9, x13]\n"
+ "zip2 z31.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "ld1b { z30.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z29.b }, p2/Z, [x27, x13]\n"
+ "zip1 z1.b, z2.b, z0.b\n"
+ "zip2 z0.b, z2.b, z0.b\n"
+ "ld1b { z28.b }, p2/Z, [x26, x13]\n"
+ "ld1b { z27.b }, p2/Z, [x25, x13]\n"
+ "zip2 z26.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "zip1 z28.b, z29.b, z27.b\n"
+ "zip2 z27.b, z29.b, z27.b\n"
+ "ld1w { z10.s }, p1/Z, [%x[params]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z23.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z7.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "ld1rw { z22.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "zip1 z6.b, z4.b, z5.b\n"
+ "zip2 z5.b, z4.b, z5.b\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "zip2 z2.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip1 z1.b, z31.b, z0.b\n"
+ "zip2 z0.b, z31.b, z0.b\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z29.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z28.b, z26.b, z27.b\n"
+ "zip2 z27.b, z26.b, z27.b\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z4.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z26.d, z10.d\n"
"1:" // Loop
- "mov z7.s, #0x0\n"
- "ld1b { z19.b }, p1/Z, [x11, x19]\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "mov z6.s, #0x0\n"
- "ld1b { z18.b }, p1/Z, [x10, x19]\n"
- "ldp x11, x10, [%x[inptrs], #0x40]\n"
- "ld1b { z16.b }, p1/Z, [x9, x19]\n"
- "zip1 z21.b, z19.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x28, x19]\n"
- "zip2 z19.b, z19.b, z16.b\n"
- "ldp x9, x28, [%x[inptrs], #0x50]\n"
- "ld1b { z23.b }, p1/Z, [x27, x19]\n"
- "zip1 z16.b, z18.b, z17.b\n"
- "ld1b { z20.b }, p1/Z, [x26, x19]\n"
- "zip2 z18.b, z18.b, z17.b\n"
- "ldp x27, x26, [%x[inptrs], #0x60]\n"
- "zip1 z5.b, z21.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x25, x19]\n"
- "zip2 z4.b, z21.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x19]\n"
- "zip1 z29.b, z19.b, z18.b\n"
- "ldp x25, x24, [%x[inptrs], #0x70]\n"
- "zip2 z28.b, z19.b, z18.b\n"
- "ld1b { z22.b }, p1/Z, [x11, x19]\n"
- "zip1 z19.b, z23.b, z17.b\n"
- "ld1b { z21.b }, p1/Z, [x10, x19]\n"
- "zip2 z27.b, z23.b, z17.b\n"
- "ldp x11, x10, [%x[inptrs], #0x0]\n"
- "zip1 z18.b, z20.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x9, x19]\n"
- "zip2 z20.b, z20.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x28, x19]\n"
- "zip1 z3.b, z19.b, z18.b\n"
- "ldp x9, x28, [%x[inptrs], #0x10]\n"
- "zip2 z2.b, z19.b, z18.b\n"
- "ld1b { z19.b }, p1/Z, [x27, x19]\n"
- "zip1 z26.b, z22.b, z17.b\n"
- "ld1b { z25.b }, p1/Z, [x26, x19]\n"
- "zip2 z24.b, z22.b, z17.b\n"
- "ldp x27, x26, [%x[inptrs], #0x20]\n"
- "zip1 z23.b, z21.b, z16.b\n"
- "ld1b { z18.b }, p1/Z, [x25, x19]\n"
- "zip2 z22.b, z21.b, z16.b\n"
- "ld1b { z21.b }, p1/Z, [x24, x19]\n"
- "zip1 z17.b, z27.b, z20.b\n"
- "ldp x25, x24, [%x[inptrs], #0x30]\n"
- "zip2 z16.b, z27.b, z20.b\n"
- "st1b { z29.b }, p2, [SP]\n"
- "zip1 z20.b, z19.b, z18.b\n"
- "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
- "zip2 z19.b, z19.b, z18.b\n"
- "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
- "zip1 z18.b, z25.b, z21.b\n"
- "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
- "zip2 z17.b, z25.b, z21.b\n"
- "ld1w { z1.s }, p2/Z, [%x[params]]\n"
- "zip1 z0.b, z26.b, z23.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z30.b, z26.b, z23.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z16.b, z24.b, z22.b\n"
- "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
- "zip2 z16.b, z24.b, z22.b\n"
- "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
- "zip1 z28.b, z20.b, z18.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "zip2 z26.b, z20.b, z18.b\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "zip1 z16.b, z19.b, z17.b\n"
- "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
- "zip2 z16.b, z19.b, z17.b\n"
- "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
- "mov z24.d, z1.d\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "mov z22.d, z1.d\n"
- "mov z21.d, z1.d\n"
- "sdot z1.s, z31.b, z5.b\n"
- "sdot z22.s, z31.b, z3.b\n"
- "sdot z7.s, z12.b, z3.b\n"
- "sdot z1.s, z29.b, z3.b\n"
+ "mov z19.s, #0x0\n"
+ "sdot z19.s, z12.b, z8.b\n"
+ "sdot z10.s, z21.b, z14.b\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "sdot z19.s, z12.b, z3.b\n"
+ "sdot z31.s, z21.b, z8.b\n"
+ "incw x13, ALL, MUL #4\n"
+ "sdot z10.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "movprfx z18, z19\n sdot z18.s, z12.b, z30.b\n"
+ "sdot z19.s, z12.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "sdot z31.s, z16.b, z3.b\n"
+ "sdot z10.s, z20.b, z3.b\n"
"ext z3.b, z3.b, z3.b, #0x1\n"
- "sdot z22.s, z29.b, z0.b\n"
- "sdot z7.s, z12.b, z0.b\n"
- "sdot z1.s, z27.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "sdot z22.s, z27.b, z28.b\n"
- "mov z20.d, z7.d\n"
- "sdot z7.s, z12.b, z5.b\n"
- "sdot z20.s, z12.b, z28.b\n"
- "ext z5.b, z5.b, z5.b, #0x1\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "sdot z21.s, z31.b, z3.b\n"
- "sdot z6.s, z12.b, z3.b\n"
- "sdot z24.s, z31.b, z5.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "sdot z21.s, z29.b, z0.b\n"
- "sdot z6.s, z12.b, z0.b\n"
- "sdot z24.s, z29.b, z3.b\n"
- "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "sdot z21.s, z27.b, z28.b\n"
- "mov z19.d, z6.d\n"
- "sdot z24.s, z27.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n"
- "sdot z6.s, z12.b, z5.b\n"
- "ld1b { z5.b }, p2/Z, [SP]\n"
- "sdot z19.s, z12.b, z28.b\n"
- "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n"
- "and z16.d, z1.d, z23.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "mov z7.s, #0x0\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "sdot z7.s, z12.b, z2.b\n"
- "mov z6.s, #0x0\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- "sdot z7.s, z12.b, z30.b\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z17.d, z22.d, z23.d\n"
- "mov z20.d, z7.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "sdot z7.s, z12.b, z4.b\n"
- "sdot z20.s, z12.b, z26.b\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- "add z1.s, z1.s, z8.s\n"
- "and z16.d, z21.d, z23.d\n"
+ "sdot z4.s, z21.b, z14.b\n"
+ "sdot z26.s, z21.b, z8.b\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z12.b, z8.b\n"
+ "sdot z17.s, z12.b, z3.b\n"
+ "sdot z31.s, z20.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z4.s, z16.b, z8.b\n"
+ "sdot z26.s, z16.b, z3.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "movprfx z16, z17\n sdot z16.s, z12.b, z30.b\n"
+ "mov z19.s, #0x0\n"
+ "sdot z17.s, z12.b, z14.b\n"
+ "ld1w { z14.s }, p1/Z, [%x[params]]\n"
+ "sdot z4.s, z20.b, z3.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "sdot z26.s, z20.b, z30.b\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "and z21.d, z10.d, z8.d\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ "sdot z19.s, z12.b, z7.b\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "sdot z19.s, z12.b, z2.b\n"
+ "and z16.d, z4.d, z8.d\n"
+ "and z20.d, z31.d, z8.d\n"
+ "movprfx z18, z19\n sdot z18.s, z12.b, z29.b\n"
+ "ld1w { z14.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "and z21.d, z26.d, z8.d\n"
"asr z16.s, z16.s, #0x1f\n"
- "add z24.s, z24.s, z8.s\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z22.s, z22.s, z8.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "addvl %x[params], %x[params], #16\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "mov z24.d, z1.d\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "add z21.s, z21.s, z8.s\n"
- "mov z22.d, z1.d\n"
- "sdot z22.s, z31.b, z2.b\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "sdot z22.s, z29.b, z30.b\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "mov z21.d, z1.d\n"
- "incw x19\n"
- "sdot z1.s, z31.b, z4.b\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "sdot z22.s, z27.b, z26.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- "sdot z1.s, z29.b, z2.b\n"
+ "sdot z19.s, z12.b, z13.b\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "add z31.s, z31.s, z22.s\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "mov z4.d, z10.d\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "mov z31.d, z10.d\n"
+ "sdot z31.s, z21.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "mov z26.d, z10.d\n"
+ "sdot z10.s, z21.b, z13.b\n"
+ "sdot z10.s, z16.b, z7.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ "sdot z4.s, z21.b, z13.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "mov z17.s, #0x0\n"
+ "sdot z26.s, z21.b, z7.b\n"
+ "sdot z17.s, z12.b, z7.b\n"
+ "incw x20\n"
+ "sdot z31.s, z16.b, z2.b\n"
+ "sdot z10.s, z20.b, z2.b\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z24.s, z31.b, z4.b\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- "sdot z1.s, z27.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z21.s, z31.b, z2.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z24.s, z29.b, z2.b\n"
- "sdot z6.s, z12.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- "sdot z21.s, z29.b, z30.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "sdot z24.s, z27.b, z30.b\n"
- "sdot z6.s, z12.b, z30.b\n"
- "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n"
- "and z17.d, z22.d, z23.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "sdot z21.s, z27.b, z26.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
- "mov z19.d, z6.d\n"
- "sdot z6.s, z12.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n"
- "sdot z19.s, z12.b, z26.b\n"
- "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "mov z7.s, #0x0\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "sdot z7.s, z12.b, z3.b\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "mov z6.s, #0x0\n"
- "sdot z7.s, z12.b, z0.b\n"
- "and z16.d, z1.d, z23.d\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "sdot z4.s, z16.b, z7.b\n"
+ "sdot z26.s, z16.b, z2.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z17.s, z12.b, z2.b\n"
+ "sdot z31.s, z20.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "sdot z4.s, z20.b, z2.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "sdot z26.s, z20.b, z29.b\n"
+ "movprfx z16, z17\n sdot z16.s, z12.b, z29.b\n"
+ "and z21.d, z10.d, z8.d\n"
+ "sdot z17.s, z12.b, z13.b\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "ld1w { z14.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "and z16.d, z4.d, z8.d\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "and z20.d, z31.d, z8.d\n"
+ "and z21.d, z26.d, z8.d\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "mov z20.d, z7.d\n"
- "sdot z7.s, z12.b, z5.b\n"
- "sdot z20.s, z12.b, z28.b\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [%x[params]]\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- "and z16.d, z21.d, z23.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "add z31.s, z31.s, z22.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "mov z19.s, #0x0\n"
+ "sdot z19.s, z12.b, z6.b\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "mov z4.d, z10.d\n"
+ "sdot z19.s, z12.b, z1.b\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "mov z31.d, z10.d\n"
+ "sdot z31.s, z21.b, z6.b\n"
+ "movprfx z18, z19\n sdot z18.s, z12.b, z28.b\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "mov z26.d, z10.d\n"
+ "sdot z10.s, z21.b, z11.b\n"
+ "sdot z10.s, z16.b, z6.b\n"
+ "sdot z19.s, z12.b, z11.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z4.s, z21.b, z11.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "mov z17.s, #0x0\n"
+ "sdot z26.s, z21.b, z6.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z17.s, z12.b, z6.b\n"
+ "sdot z31.s, z16.b, z1.b\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "sdot z10.s, z20.b, z1.b\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z4.s, z16.b, z6.b\n"
+ "sdot z26.s, z16.b, z1.b\n"
+ "sdot z17.s, z12.b, z1.b\n"
+ "sdot z31.s, z20.b, z28.b\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "sdot z4.s, z20.b, z1.b\n"
+ "sdot z26.s, z20.b, z28.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "movprfx z16, z17\n sdot z16.s, z12.b, z28.b\n"
+ "sdot z17.s, z12.b, z11.b\n"
+ "and z21.d, z10.d, z8.d\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "ld1w { z14.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "and z16.d, z4.d, z8.d\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "and z20.d, z31.d, z8.d\n"
+ "and z21.d, z26.d, z8.d\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "add z22.s, z22.s, z8.s\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- "add z1.s, z1.s, z8.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "add z24.s, z24.s, z8.s\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z21.s, z21.s, z8.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "mov z24.d, z1.d\n"
- "mov z22.d, z1.d\n"
- "sdot z22.s, z31.b, z3.b\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "mov z21.d, z1.d\n"
- "incw x19\n"
- "sdot z1.s, z31.b, z5.b\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "sdot z22.s, z29.b, z0.b\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ "ld1b { z16.b }, p1/Z, [%x[params]]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "add z31.s, z31.s, z22.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "mov z19.s, #0x0\n"
+ "sdot z19.s, z12.b, z5.b\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "mov z4.d, z10.d\n"
+ "sdot z19.s, z12.b, z0.b\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "mov z31.d, z10.d\n"
+ "sdot z31.s, z21.b, z5.b\n"
+ "movprfx z18, z19\n sdot z18.s, z12.b, z27.b\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "mov z26.d, z10.d\n"
+ "sdot z10.s, z21.b, z9.b\n"
+ "sdot z10.s, z16.b, z5.b\n"
+ "sdot z19.s, z12.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z4.s, z21.b, z9.b\n"
"ext z5.b, z5.b, z5.b, #0x1\n"
- "sdot z1.s, z29.b, z3.b\n"
- "sdot z22.s, z27.b, z28.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "sdot z24.s, z31.b, z5.b\n"
- "sdot z1.s, z27.b, z0.b\n"
+ "mov z17.s, #0x0\n"
+ "sdot z26.s, z21.b, z5.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z17.s, z12.b, z5.b\n"
+ "sdot z31.s, z16.b, z0.b\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "sdot z10.s, z20.b, z0.b\n"
"ext z0.b, z0.b, z0.b, #0x1\n"
- "sdot z21.s, z31.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z24.s, z29.b, z3.b\n"
- "sdot z6.s, z12.b, z3.b\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "sdot z21.s, z29.b, z0.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "sdot z24.s, z27.b, z0.b\n"
- "sdot z6.s, z12.b, z0.b\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "sdot z21.s, z27.b, z28.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "mov z7.s, #0x0\n"
- "mov z19.d, z6.d\n"
- "sdot z6.s, z12.b, z5.b\n"
- "sdot z19.s, z12.b, z28.b\n"
- "and z16.d, z1.d, z23.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sdot z7.s, z12.b, z2.b\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "mov z6.s, #0x0\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z17.d, z22.d, z23.d\n"
- "and z16.d, z21.d, z23.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "sdot z7.s, z12.b, z30.b\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "sdot z4.s, z16.b, z5.b\n"
+ "whilelt p2.b, x13, %x[n_channels]\n"
+ "sdot z26.s, z16.b, z0.b\n"
+ "sdot z17.s, z12.b, z0.b\n"
+ "ld1b { z13.b }, p2/Z, [x11, x13]\n"
+ "ld1b { z11.b }, p2/Z, [x10, x13]\n"
+ "sdot z31.s, z20.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "ld1b { z7.b }, p2/Z, [x27, x13]\n"
+ "sdot z4.s, z20.b, z0.b\n"
+ "sdot z26.s, z20.b, z27.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "ld1b { z6.b }, p2/Z, [x26, x13]\n"
+ "movprfx z16, z17\n sdot z16.s, z12.b, z27.b\n"
+ "sdot z17.s, z12.b, z9.b\n"
+ "and z21.d, z10.d, z8.d\n"
+ "ld1b { z9.b }, p2/Z, [x9, x13]\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "ld1b { z5.b }, p2/Z, [x25, x13]\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "ld1b { z14.b }, p2/Z, [x12, x13]\n"
+ "ldp x12, x11, [%x[inptrs], #0x40]\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "and z16.d, z4.d, z8.d\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z8.d\n"
+ "and z21.d, z26.d, z8.d\n"
+ "ld1b { z3.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z2.b }, p2/Z, [x11, x13]\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "add z1.s, z1.s, z8.s\n"
- "mov z20.d, z7.d\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "sdot z7.s, z12.b, z4.b\n"
- "sdot z20.s, z12.b, z26.b\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z24.s, z24.s, z8.s\n"
- "add z22.s, z22.s, z8.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "ld1b { z1.b }, p2/Z, [x10, x13]\n"
+ "ld1b { z0.b }, p2/Z, [x9, x13]\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "ld1b { z8.b }, p2/Z, [x28, x13]\n"
+ "add z31.s, z31.s, z22.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1b { z30.b }, p2/Z, [x28, x13]\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "ld1b { z29.b }, p2/Z, [x27, x13]\n"
+ "ld1b { z28.b }, p2/Z, [x26, x13]\n"
+ "ld1b { z27.b }, p2/Z, [x25, x13]\n"
+ "zip2 z10.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "zip1 z11.b, z13.b, z9.b\n"
+ "zip2 z9.b, z13.b, z9.b\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "zip2 z13.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "zip1 z11.b, z10.b, z9.b\n"
+ "zip2 z9.b, z10.b, z9.b\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "zip2 z4.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "incw x20\n"
+ "zip1 z6.b, z7.b, z5.b\n"
+ "zip2 z5.b, z7.b, z5.b\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z1.b, z2.b, z0.b\n"
+ "zip2 z0.b, z2.b, z0.b\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "add z21.s, z21.s, z8.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "mov z24.d, z1.d\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "mov z22.d, z1.d\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "mov z21.d, z1.d\n"
- "incw x19\n"
- "sdot z1.s, z31.b, z4.b\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "sdot z22.s, z31.b, z2.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "sdot z1.s, z29.b, z2.b\n"
- "sdot z22.s, z29.b, z30.b\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z24.s, z31.b, z4.b\n"
- "sdot z1.s, z27.b, z30.b\n"
- "sdot z22.s, z27.b, z26.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- "sdot z21.s, z31.b, z2.b\n"
- "sdot z24.s, z29.b, z2.b\n"
- "sdot z6.s, z12.b, z2.b\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "sdot z21.s, z29.b, z30.b\n"
- "sdot z24.s, z27.b, z30.b\n"
- "sdot z6.s, z12.b, z30.b\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "sdot z21.s, z27.b, z26.b\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- "mov z19.d, z6.d\n"
- "sdot z6.s, z12.b, z4.b\n"
- "sdot z19.s, z12.b, z26.b\n"
- "and z16.d, z1.d, z23.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "and z17.d, z22.d, z23.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z16.d, z21.d, z23.d\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "add z1.s, z1.s, z8.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- "add z22.s, z22.s, z8.s\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z24.s, z24.s, z8.s\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "add z21.s, z21.s, z8.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "incw x19\n"
- "whilelt p1.b, x19, %x[n_channels]\n"
+ "zip2 z26.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "zip1 z28.b, z29.b, z27.b\n"
+ "zip2 z27.b, z29.b, z27.b\n"
+ "zip2 z7.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "zip1 z6.b, z4.b, z5.b\n"
+ "zip2 z5.b, z4.b, z5.b\n"
+ "zip2 z2.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "zip1 z1.b, z31.b, z0.b\n"
+ "zip2 z0.b, z31.b, z0.b\n"
+ "zip2 z29.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "zip1 z28.b, z26.b, z27.b\n"
+ "zip2 z27.b, z26.b, z27.b\n"
+ "mov z4.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z26.d, z10.d\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 3583308357..4ebf5be285 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,324 +91,316 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x8, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x8\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x15, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x14, #0x0\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z12.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z18.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z13.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x15, x17\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x15, x17\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "mov x19, x15\n"
- "incw x19\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "whilelt p1.s, x19, x17\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z17.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z17.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "mov z9.d, z11.d\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z23.b }, p4/Z, [x21]\n"
+ "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z14.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z11.h }, p4/Z, [x20]\n"
+ "ldp x15, x14, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x8, x17\n"
+ "ldp x13, x12, [x24, #0x10]\n"
+ "whilelt p2.s, x8, x17\n"
+ "whilelt p1.s, x23, x17\n"
+ "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
"ld1sb { z0.h }, p4/Z, [x16]\n"
- ".inst 0x45521000 // ssublb z0.h, z0.b, z18.b\n"
- "mov z20.d, z17.d\n"
"ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "mov z24.d, z11.d\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
"ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- ".inst 0x45521021 // ssublb z1.h, z1.b, z18.b\n"
- "mov z19.d, z17.d\n"
"ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- "mov z26.d, z11.d\n"
+ ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
+ ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
"ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- ".inst 0x45521042 // ssublb z2.h, z2.b, z18.b\n"
- "mov z23.d, z17.d\n"
"ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x45521063 // ssublb z3.h, z3.b, z18.b\n"
+ ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
+ ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
"ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- ".inst 0x45521084 // ssublb z4.h, z4.b, z18.b\n"
"inch x16, ALL, MUL #8\n"
+ ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z13.s, z17.s, z16.s\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
"ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- ".inst 0x455210a5 // ssublb z5.h, z5.b, z18.b\n"
- ".inst 0x455210c6 // ssublb z6.h, z6.b, z18.b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- ".inst 0x455210e7 // ssublb z7.h, z7.b, z18.b\n"
- ".inst 0x45521108 // ssublb z8.h, z8.b, z18.b\n"
- "ldr x19, [x12, #0x20]\n"
- "ld1sb { z31.h }, p3/Z, [x23, x15]\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- "ld1sb { z30.h }, p3/Z, [x22, x15]\n"
- "ld1sb { z29.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
- "ld1sb { z28.h }, p3/Z, [x20, x15]\n"
- "ld1sb { z27.h }, p3/Z, [x19, x15]\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
- ".inst 0x454c137b // ssublb z27.h, z27.b, z12.b\n"
+ "ldp x24, x23, [x11, #0x0]\n"
+ "addvl x26, x26, #2\n"
+ "mov z26.d, z13.d\n"
+ "ldp x22, x21, [x11, #0x10]\n"
+ "ldr x20, [x11, #0x20]\n"
+ "mov z10.d, z17.d\n"
+ "mov z24.d, z13.d\n"
+ "ld1sb { z31.h }, p3/Z, [x24, x8]\n"
+ "ld1sb { z30.h }, p3/Z, [x23, x8]\n"
+ "mov z16.d, z17.d\n"
+ "mov z25.d, z13.d\n"
+ "ld1sb { z29.h }, p3/Z, [x22, x8]\n"
+ "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
+ "mov z9.d, z17.d\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x26, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
+ ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
"1:" // Loop
- ".inst 0x448443eb // smlalb z11.s, p4/M, z31.h, z4.h\n"
- "ldr x21, [x12, #0x28]\n"
- "whilelt p0.h, x14, x17\n"
+ ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x20, [x12, #0x30]\n"
- "inch x16\n"
- ".inst 0x448343e9 // smlalb z9.s, p4/M, z31.h, z3.h\n"
- "ldr x26, [x12, #0x38]\n"
- ".inst 0x448347f4 // smlalt z20.s, p4/M, z31.h, z3.h\n"
- "ldr x25, [x12, #0x40]\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- "ldr x19, [x12, #0x48]\n"
- ".inst 0x448147f3 // smlalt z19.s, p4/M, z31.h, z1.h\n"
- "ldr x24, [x12, #0x50]\n"
- ".inst 0x448043fa // smlalb z26.s, p4/M, z31.h, z0.h\n"
- "ldr x23, [x12, #0x58]\n"
- ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
- "ld1sb { z31.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- ".inst 0x448043cb // smlalb z11.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x12, #0x60]\n"
+ "ldr x22, [x11, #0x28]\n"
+ "ldr x27, [x11, #0x38]\n"
+ ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
+ "ldr x21, [x11, #0x30]\n"
+ "ldr x26, [x11, #0x40]\n"
+ ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z30.h }, p3/Z, [x19, x15]\n"
- ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
- ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
- "ldr x21, [x12, #0x68]\n"
- ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
- "ld1sb { z29.h }, p3/Z, [x20, x15]\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
- "ldr x20, [x12, #0x70]\n"
+ "ldr x20, [x11, #0x48]\n"
+ "ld1sb { z30.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
+ "ldr x25, [x11, #0x50]\n"
+ "ldr x24, [x11, #0x58]\n"
+ ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
+ "ld1sb { z31.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
+ ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- "ldr x19, [x12, #0x78]\n"
- ".inst 0x44844389 // smlalb z9.s, p4/M, z28.h, z4.h\n"
- "ld1w { z25.s }, p2/Z, [x13]\n"
- ".inst 0x44844794 // smlalt z20.s, p4/M, z28.h, z4.h\n"
- "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n"
- "addvl x13, x13, #2\n"
+ ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
+ "ldr x23, [x11, #0x60]\n"
+ ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
+ "ldr x22, [x11, #0x68]\n"
+ "ldr x21, [x11, #0x70]\n"
".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824793 // smlalt z19.s, p4/M, z28.h, z2.h\n"
- ".inst 0x4481439a // smlalb z26.s, p4/M, z28.h, z1.h\n"
- "uzp1 z10.s, z25.s, z16.s\n"
- "uzp2 z22.s, z25.s, z16.s\n"
- "ld1w { z25.s }, p2/Z, [x11]\n"
- ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x26, x15]\n"
- ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n"
- ".inst 0x448647f3 // smlalt z19.s, p4/M, z31.h, z6.h\n"
- "ld1sb { z31.h }, p3/Z, [x25, x15]\n"
- "addvl x11, x11, #2\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- "uzp1 z21.s, z25.s, z16.s\n"
- "uzp2 z25.s, z25.s, z16.s\n"
+ ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
+ "ldr x20, [x11, #0x78]\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
+ "ld1sb { z28.h }, p3/Z, [x27, x8]\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44864369 // smlalb z9.s, p4/M, z27.h, z6.h\n"
- ".inst 0x44864774 // smlalt z20.s, p4/M, z27.h, z6.h\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844773 // smlalt z19.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4483437a // smlalb z26.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834777 // smlalt z23.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z19.s, z20.s, z18.s\n"
+ ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
+ "uzp2 z22.s, z20.s, z18.s\n"
+ "ld1w { z20.s }, p2/Z, [x28]\n"
+ ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
+ "ld1sb { z31.h }, p3/Z, [x26, x8]\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
+ ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
+ "whilelt p0.h, x10, x17\n"
+ "inch x16\n"
+ ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448843ba // smlalb z26.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
- "ld1sb { z29.h }, p3/Z, [x24, x15]\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x44804389 // smlalb z9.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44804794 // smlalt z20.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z28.h }, p3/Z, [x23, x15]\n"
- ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z28.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x8]\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448143e9 // smlalb z9.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f4 // smlalt z20.s, p4/M, z31.h, z1.h\n"
- "ld1sb { z31.h }, p3/Z, [x22, x15]\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- ".inst 0x448843cb // smlalb z11.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448743c9 // smlalb z9.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747d4 // smlalt z20.s, p4/M, z30.h, z7.h\n"
+ "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448547d3 // smlalt z19.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443da // smlalb z26.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448447d7 // smlalt z23.s, p4/M, z30.h, z4.h\n"
- "ld1sb { z30.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
- ".inst 0x448343ab // smlalb z11.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
+ "uzp1 z1.s, z20.s, z18.s\n"
+ ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
+ "uzp2 z27.s, z20.s, z18.s\n"
+ ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
+ "ld1sb { z30.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x448047b3 // smlalt z19.s, p4/M, z29.h, z0.h\n"
- "ld1sb { z29.h }, p3/Z, [x20, x15]\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854794 // smlalt z20.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4482439a // smlalb z26.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824797 // smlalt z23.s, p4/M, z28.h, z2.h\n"
- "ld1sb { z28.h }, p3/Z, [x19, x15]\n"
- "inch x15\n"
- ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
- "whilelt p2.s, x15, x17\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- "mov x19, x15\n"
+ ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
- ".inst 0x448347f3 // smlalt z19.s, p4/M, z31.h, z3.h\n"
- "incw x19\n"
- ".inst 0x448843c9 // smlalb z9.s, p4/M, z30.h, z8.h\n"
- "whilelt p1.s, x19, x17\n"
- ".inst 0x04aa756b // sqrdmulh z11.s, z11.s, z10.s\n"
- "whilelt p3.h, x15, x17\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
- ".inst 0x448847d4 // smlalt z20.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
- "and z16.d, z11.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z1.d, z17.d, z25.d\n"
- "and z27.d, z9.d, z21.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n"
- ".inst 0x448543da // smlalb z26.s, p4/M, z30.h, z5.h\n"
- "asr z27.s, z27.s, #0x1f\n"
- ".inst 0x448547d7 // smlalt z23.s, p4/M, z30.h, z5.h\n"
- "sqadd z11.s, z11.s, z16.s\n"
+ ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "inch x8\n"
+ ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z16.d, z20.d, z25.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z17.s, z17.s, z1.s\n"
- "sqadd z9.s, z9.s, z27.s\n"
- ".inst 0x448747b3 // smlalt z19.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448643ba // smlalb z26.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647b7 // smlalt z23.s, p4/M, z29.h, z6.h\n"
+ "and z21.d, z13.d, z1.d\n"
+ "mov x20, x8\n"
+ ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ "incw x20\n"
+ ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "whilelt p2.s, x8, x17\n"
+ ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "sqadd z20.s, z20.s, z16.s\n"
- ".inst 0x44884793 // smlalt z19.s, p4/M, z28.h, z8.h\n"
- ".inst 0x4487439a // smlalb z26.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04aa7718 // sqrdmulh z24.s, z24.s, z10.s\n"
- ".inst 0x44874797 // smlalt z23.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n"
- ".inst 0x04aa775a // sqrdmulh z26.s, z26.s, z10.s\n"
- "and z16.d, z24.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z7.d, z19.d, z25.d\n"
- "and z3.d, z26.d, z21.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
- ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n"
- "sqadd z24.s, z24.s, z16.s\n"
- ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
- "add z11.s, z11.s, z15.s\n"
- "add z17.s, z17.s, z15.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "add z9.s, z9.s, z15.s\n"
- "sqadd z26.s, z26.s, z3.s\n"
- "and z16.d, z23.d, z25.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "smin z11.s, p4/M, z11.s, z14.s\n"
- "smin z17.s, p4/M, z17.s, z14.s\n"
- "smin z9.s, p4/M, z9.s, z14.s\n"
- ".inst 0x44829334 // srshl z20.s, p4/M, z20.s, z25.s\n"
- ".inst 0x448292b8 // srshl z24.s, p4/M, z24.s, z21.s\n"
- "smax z11.s, p4/M, z11.s, z13.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- "add z20.s, z20.s, z15.s\n"
- "add z24.s, z24.s, z15.s\n"
- "smax z17.s, p4/M, z17.s, z13.s\n"
- "smax z9.s, p4/M, z9.s, z13.s\n"
- "smin z20.s, p4/M, z20.s, z14.s\n"
- "smin z24.s, p4/M, z24.s, z14.s\n"
- "trn1 z11.h, z11.h, z17.h\n"
- "st1b { z11.h }, p0, [x10, x14]\n"
- "smax z20.s, p4/M, z20.s, z13.s\n"
- ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n"
- "smax z24.s, p4/M, z24.s, z13.s\n"
- ".inst 0x448292ba // srshl z26.s, p4/M, z26.s, z21.s\n"
- ".inst 0x44829337 // srshl z23.s, p4/M, z23.s, z25.s\n"
- "trn1 z9.h, z9.h, z20.h\n"
- "st1b { z9.h }, p0, [x9, x14]\n"
- "add z19.s, z19.s, z15.s\n"
- "add z26.s, z26.s, z15.s\n"
- "add z23.s, z23.s, z15.s\n"
- "smin z19.s, p4/M, z19.s, z14.s\n"
- "smin z26.s, p4/M, z26.s, z14.s\n"
- "smin z23.s, p4/M, z23.s, z14.s\n"
- "smax z19.s, p4/M, z19.s, z13.s\n"
- "smax z26.s, p4/M, z26.s, z13.s\n"
- "smax z23.s, p4/M, z23.s, z13.s\n"
- "trn1 z24.h, z24.h, z19.h\n"
- "st1b { z24.h }, p0, [x28, x14]\n"
- "trn1 z26.h, z26.h, z23.h\n"
- "st1b { z26.h }, p0, [x27, x14]\n"
- "inch x14\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z17.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z17.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "mov z9.d, z11.d\n"
+ "and z20.d, z17.d, z27.d\n"
+ "whilelt p1.s, x20, x17\n"
+ ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
+ "whilelt p3.h, x8, x17\n"
+ ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
+ "sqadd z13.s, z13.s, z21.s\n"
+ ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z26.d, z1.d\n"
+ ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
+ "and z18.d, z24.d, z1.d\n"
+ ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
+ "and z21.d, z25.d, z1.d\n"
+ ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
+ "sqadd z17.s, z17.s, z20.s\n"
+ ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z10.d, z27.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z22.d, z16.d, z27.d\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z20.d, z9.d, z27.d\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
+ "sqadd z25.s, z25.s, z21.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ "sqadd z10.s, z10.s, z2.s\n"
+ "sqadd z16.s, z16.s, z22.s\n"
+ ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
+ ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
+ "sqadd z9.s, z9.s, z20.s\n"
+ ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
+ ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
+ ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
+ ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
+ ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
+ ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
+ ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
+ ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
+ ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
+ "sqadd z13.h, z13.h, z14.h\n"
+ "smax z13.h, p4/M, z13.h, z12.h\n"
+ "smin z13.h, p4/M, z13.h, z11.h\n"
+ "sqadd z26.h, z26.h, z14.h\n"
+ "sqadd z24.h, z24.h, z14.h\n"
+ "smax z26.h, p4/M, z26.h, z12.h\n"
+ "smax z24.h, p4/M, z24.h, z12.h\n"
+ "sqadd z25.h, z25.h, z14.h\n"
+ "smax z25.h, p4/M, z25.h, z12.h\n"
+ "smin z26.h, p4/M, z26.h, z11.h\n"
+ "st1b { z13.h }, p0, [x15, x10]\n"
+ "smin z24.h, p4/M, z24.h, z11.h\n"
+ "smin z25.h, p4/M, z25.h, z11.h\n"
+ "st1b { z26.h }, p0, [x14, x10]\n"
+ "st1b { z24.h }, p0, [x13, x10]\n"
+ "st1b { z25.h }, p0, [x12, x10]\n"
"ld1sb { z0.h }, p4/Z, [x16]\n"
- ".inst 0x45521000 // ssublb z0.h, z0.b, z18.b\n"
- "mov z20.d, z17.d\n"
"ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "mov z24.d, z11.d\n"
+ "inch x10\n"
"ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- ".inst 0x45521021 // ssublb z1.h, z1.b, z18.b\n"
- "mov z19.d, z17.d\n"
"ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- "mov z26.d, z11.d\n"
+ ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
+ ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
"ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- ".inst 0x45521042 // ssublb z2.h, z2.b, z18.b\n"
- "mov z23.d, z17.d\n"
"ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x45521063 // ssublb z3.h, z3.b, z18.b\n"
+ ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
+ ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
"ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
"ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- ".inst 0x45521084 // ssublb z4.h, z4.b, z18.b\n"
"inch x16, ALL, MUL #8\n"
+ ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z13.s, z17.s, z16.s\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
"ld1sb { z8.h }, p4/Z, [x16]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- ".inst 0x455210a5 // ssublb z5.h, z5.b, z18.b\n"
- ".inst 0x455210c6 // ssublb z6.h, z6.b, z18.b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- ".inst 0x455210e7 // ssublb z7.h, z7.b, z18.b\n"
- ".inst 0x45521108 // ssublb z8.h, z8.b, z18.b\n"
- "ldr x19, [x12, #0x20]\n"
- "ld1sb { z31.h }, p3/Z, [x23, x15]\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- "ld1sb { z30.h }, p3/Z, [x22, x15]\n"
- "ld1sb { z29.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
- "ld1sb { z28.h }, p3/Z, [x20, x15]\n"
- "ld1sb { z27.h }, p3/Z, [x19, x15]\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
- ".inst 0x454c137b // ssublb z27.h, z27.b, z12.b\n"
+ "ldp x24, x23, [x11, #0x0]\n"
+ "addvl x26, x26, #2\n"
+ "str x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x11, #0x10]\n"
+ "ldr x20, [x11, #0x20]\n"
+ "mov z26.d, z13.d\n"
+ "mov z10.d, z17.d\n"
+ "ld1sb { z31.h }, p3/Z, [x24, x8]\n"
+ "ld1sb { z30.h }, p3/Z, [x23, x8]\n"
+ "mov z24.d, z13.d\n"
+ "mov z16.d, z17.d\n"
+ "ld1sb { z29.h }, p3/Z, [x22, x8]\n"
+ "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
+ "mov z25.d, z13.d\n"
+ "mov z9.d, z17.d\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
+ ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
+ ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index ba8c1fdb8d..357c9f8399 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,356 +100,348 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ptrue p4.b\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
"mov x7, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x8, #0x0\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z19.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z23.b }, p4/Z, [x21]\n"
"ld1rb { z12.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z20.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x7, x5\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x7, x5\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "mov x19, x7\n"
- "incw x19\n"
- "ldp x12, x11, [x21, #0x10]\n"
- "whilelt p1.s, x19, x5\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z18.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z18.s, z16.s\n"
- "mov z11.d, z13.d\n"
- "ld1sb { z0.h }, p4/Z, [x6]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z14.h }, p4/Z, [x22]\n"
+ "ld1rh { z16.h }, p4/Z, [x21]\n"
+ "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z0.h }, p4/Z, [x17]\n"
+ "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- "mov z9.d, z16.d\n"
- "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
- "mov z18.d, z13.d\n"
- "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "mov z10.d, z16.d\n"
- "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
- "mov z22.d, z13.d\n"
- "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- "mov z23.d, z16.d\n"
- "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "inch x6, ALL, MUL #8\n"
- "ld1sb { z8.h }, p4/Z, [x6]\n"
- "ldp x26, x25, [x16, #0x0]\n"
+ "ld1w { z18.s }, p2/Z, [x12]\n"
+ "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z8.s\n"
+ "uzp2 z17.s, z18.s, z8.s\n"
+ "ld1sb { z8.h }, p4/Z, [x17]\n"
+ "ldp x9, x28, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z9.d, z13.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z10.d, z17.d\n"
+ "mov z11.d, z13.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x7]\n"
+ "mov z22.d, z17.d\n"
+ "mov z21.d, z13.d\n"
+ "ld1sb { z30.h }, p3/Z, [x28, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
+ "mov z18.d, z17.d\n"
".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ldp x24, x23, [x16, #0x10]\n"
".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
+ "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ldp x22, x21, [x16, #0x20]\n"
- "ldp x20, x19, [x16, #0x30]\n"
- "ld1sb { z31.h }, p3/Z, [x26, x7]\n"
- ".inst 0x455313ff // ssublb z31.h, z31.b, z19.b\n"
- "ld1sb { z30.h }, p3/Z, [x25, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
- ".inst 0x455313de // ssublb z30.h, z30.b, z19.b\n"
- "ld1sb { z28.h }, p3/Z, [x23, x7]\n"
- "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
- "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
- "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
- "ld1sb { z24.h }, p3/Z, [x19, x7]\n"
- ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
- ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
- ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
- ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
+ "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
+ ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
+ ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
"1:" // Loop
".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- "ldr x23, [x16, #0x40]\n"
- "whilelt p0.h, x8, x5\n"
- ".inst 0x448847f0 // smlalt z16.s, p4/M, z31.h, z8.h\n"
- "ldr x22, [x16, #0x48]\n"
- "inch x6\n"
- ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
- "ldr x21, [x16, #0x50]\n"
- ".inst 0x448647e9 // smlalt z9.s, p4/M, z31.h, z6.h\n"
- "ldr x20, [x16, #0x58]\n"
- ".inst 0x448243f2 // smlalb z18.s, p4/M, z31.h, z2.h\n"
- "ldr x19, [x16, #0x60]\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x448043f6 // smlalb z22.s, p4/M, z31.h, z0.h\n"
- "ldr x9, [x16, #0x70]\n"
- ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
- "ldr x28, [x16, #0x78]\n"
+ ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
+ "ldr x25, [x11, #0x40]\n"
+ "ldr x24, [x11, #0x48]\n"
+ ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
+ "ldr x22, [x11, #0x50]\n"
+ "ldr x20, [x11, #0x58]\n"
".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- "ldr x27, [x16, #0x80]\n"
- ".inst 0x448047d0 // smlalt z16.s, p4/M, z30.h, z0.h\n"
- "ldr x26, [x16, #0x88]\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
- "ldr x25, [x16, #0x90]\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
+ ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
+ "ldr x23, [x11, #0x78]\n"
+ "ldr x21, [x11, #0x60]\n"
+ ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
+ "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- "ldr x24, [x16, #0x98]\n"
- ".inst 0x448147b0 // smlalt z16.s, p4/M, z29.h, z1.h\n"
- "ld1sb { z29.h }, p3/Z, [x23, x7]\n"
- ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
- ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
- "ldr x23, [x16, #0xa0]\n"
- ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x21, x7]\n"
- ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
+ ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- "ldr x22, [x16, #0xa8]\n"
- ".inst 0x44834750 // smlalt z16.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
"ld1sb { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
+ ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
+ ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
+ "ldr x22, [x11, #0x80]\n"
+ "ldr x20, [x11, #0x68]\n"
".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- "ldr x21, [x16, #0xb0]\n"
- ".inst 0x44844730 // smlalt z16.s, p4/M, z25.h, z4.h\n"
- "ld1sb { z25.h }, p3/Z, [x19, x7]\n"
- ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
+ ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
+ ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
+ "ldr x21, [x11, #0x88]\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x16, #0xb8]\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- "ldr x19, [x16, #0xc0]\n"
- ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n"
- "ld1w { z21.s }, p2/Z, [x17]\n"
- ".inst 0x44804709 // smlalt z9.s, p4/M, z24.h, z0.h\n"
- "ld1sb { z24.h }, p3/Z, [x9, x7]\n"
- ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x448447a9 // smlalt z9.s, p4/M, z29.h, z4.h\n"
- "ld1sb { z29.h }, p3/Z, [x10, x7]\n"
- "addvl x17, x17, #2\n"
+ ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
+ "ldr x20, [x11, #0x70]\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
+ "ldr x25, [x11, #0x98]\n"
+ "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
- "uzp1 z30.s, z21.s, z17.s\n"
- "uzp2 z31.s, z21.s, z17.s\n"
- "ld1w { z21.s }, p2/Z, [x15]\n"
- ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
- "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n"
- "addvl x15, x15, #2\n"
- ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
- "ld1sb { z28.h }, p3/Z, [x27, x7]\n"
- ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
- ".inst 0x44854770 // smlalt z16.s, p4/M, z27.h, z5.h\n"
- ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "ld1sb { z27.h }, p3/Z, [x28, x7]\n"
- ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
- ".inst 0x44834352 // smlalb z18.s, p4/M, z26.h, z3.h\n"
- ".inst 0x4483474a // smlalt z10.s, p4/M, z26.h, z3.h\n"
- "ld1sb { z26.h }, p3/Z, [x26, x7]\n"
- ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44864730 // smlalt z16.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44804332 // smlalb z18.s, p4/M, z25.h, z0.h\n"
- ".inst 0x4480472a // smlalt z10.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z25.h }, p3/Z, [x25, x7]\n"
- ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
- "uzp1 z0.s, z21.s, z17.s\n"
- "uzp2 z21.s, z21.s, z17.s\n"
- ".inst 0x448443b2 // smlalb z18.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
- ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- ".inst 0x44874710 // smlalt z16.s, p4/M, z24.h, z7.h\n"
- ".inst 0x44814312 // smlalb z18.s, p4/M, z24.h, z1.h\n"
- ".inst 0x4481470a // smlalt z10.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
- ".inst 0x04be75ad // sqrdmulh z13.s, z13.s, z30.s\n"
- ".inst 0x04bf7610 // sqrdmulh z16.s, z16.s, z31.s\n"
- ".inst 0x44844376 // smlalb z22.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844777 // smlalt z23.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
+ "ldr x24, [x11, #0x90]\n"
+ ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
"ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
- "and z4.d, z13.d, z0.d\n"
- "and z17.d, z16.d, z21.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x44814396 // smlalb z22.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44864332 // smlalb z18.s, p4/M, z25.h, z6.h\n"
- ".inst 0x4486472a // smlalt z10.s, p4/M, z25.h, z6.h\n"
- "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
- "sqadd z13.s, z13.s, z4.s\n"
- "sqadd z16.s, z16.s, z17.s\n"
- ".inst 0x44854356 // smlalb z22.s, p4/M, z26.h, z5.h\n"
- ".inst 0x44854757 // smlalt z23.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
+ "ldr x23, [x11, #0xa8]\n"
+ "ldr x20, [x11, #0xa0]\n"
+ ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
"ld1sb { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
- ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448243b6 // smlalb z22.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247b7 // smlalt z23.s, p4/M, z29.h, z2.h\n"
- "ld1sb { z29.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
+ ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
+ "ldr x22, [x11, #0xb0]\n"
+ "ldr x21, [x11, #0xb8]\n"
+ ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z31.s }, p2/Z, [x27]\n"
+ ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
+ ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
+ "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z19.s, z31.s, z20.s\n"
+ ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
+ "uzp2 z30.s, z31.s, z20.s\n"
+ "ld1w { z31.s }, p2/Z, [x26]\n"
+ ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
+ ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
+ "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
+ ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
+ "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z1.s, z31.s, z20.s\n"
+ ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
+ ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
+ "uzp2 z31.s, z31.s, z20.s\n"
+ "inch x17\n"
+ ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
+ "and z0.d, z13.d, z1.d\n"
"inch x7\n"
- ".inst 0x04be756b // sqrdmulh z11.s, z11.s, z30.s\n"
- "whilelt p2.s, x7, x5\n"
- ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n"
- "mov x19, x7\n"
- ".inst 0x44874372 // smlalb z18.s, p4/M, z27.h, z7.h\n"
- ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
- ".inst 0x4487476a // smlalt z10.s, p4/M, z27.h, z7.h\n"
- "incw x19\n"
- ".inst 0x44834316 // smlalb z22.s, p4/M, z24.h, z3.h\n"
- "whilelt p1.s, x19, x5\n"
- "and z1.d, z11.d, z0.d\n"
- "whilelt p3.h, x7, x5\n"
- "and z17.d, z9.d, z21.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- ".inst 0x44854312 // smlalb z18.s, p4/M, z24.h, z5.h\n"
- ".inst 0x4485470a // smlalt z10.s, p4/M, z24.h, z5.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x44834717 // smlalt z23.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44874356 // smlalb z22.s, p4/M, z26.h, z7.h\n"
- ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n"
- ".inst 0x44884332 // smlalb z18.s, p4/M, z25.h, z8.h\n"
- "sqadd z11.s, z11.s, z1.s\n"
- "sqadd z9.s, z9.s, z17.s\n"
- "add z13.s, z13.s, z14.s\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- ".inst 0x44874757 // smlalt z23.s, p4/M, z26.h, z7.h\n"
- ".inst 0x4488472a // smlalt z10.s, p4/M, z25.h, z8.h\n"
- ".inst 0x44864336 // smlalb z22.s, p4/M, z25.h, z6.h\n"
- "and z17.d, z18.d, z0.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x04bf754a // sqrdmulh z10.s, z10.s, z31.s\n"
- ".inst 0x44864737 // smlalt z23.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448843b6 // smlalb z22.s, p4/M, z29.h, z8.h\n"
- "smin z13.s, p4/M, z13.s, z15.s\n"
- ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
- "and z1.d, z10.d, z21.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- "add z16.s, z16.s, z14.s\n"
- "sqadd z18.s, z18.s, z17.s\n"
+ ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ "mov x20, x7\n"
+ ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "incw x20\n"
+ ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
+ "and z20.d, z17.d, z31.d\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "whilelt p3.h, x7, x8\n"
+ "sqadd z13.s, z13.s, z0.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
+ "addvl x27, x27, #2\n"
+ "and z19.d, z9.d, z1.d\n"
+ ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
+ "addvl x26, x26, #2\n"
+ "and z2.d, z11.d, z1.d\n"
".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smin z16.s, p4/M, z16.s, z15.s\n"
- "sqadd z10.s, z10.s, z1.s\n"
- "and z2.d, z22.d, z0.d\n"
+ "and z0.d, z21.d, z1.d\n"
+ ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
+ "sqadd z17.s, z17.s, z20.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
+ "and z3.d, z10.d, z31.d\n"
"asr z2.s, z2.s, #0x1f\n"
- ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n"
- "smax z16.s, p4/M, z16.s, z20.s\n"
- ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n"
- ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
- ".inst 0x44829012 // srshl z18.s, p4/M, z18.s, z0.s\n"
- "trn1 z13.h, z13.h, z16.h\n"
- "st1b { z13.h }, p0, [x14, x8]\n"
- "add z11.s, z11.s, z14.s\n"
- "add z9.s, z9.s, z14.s\n"
- "add z18.s, z18.s, z14.s\n"
- "sqadd z22.s, z22.s, z2.s\n"
- "and z16.d, z23.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "smin z11.s, p4/M, z11.s, z15.s\n"
- "smin z9.s, p4/M, z9.s, z15.s\n"
- "smin z18.s, p4/M, z18.s, z15.s\n"
- ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n"
- ".inst 0x44829016 // srshl z22.s, p4/M, z22.s, z0.s\n"
- "smax z11.s, p4/M, z11.s, z20.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- "add z10.s, z10.s, z14.s\n"
- "add z22.s, z22.s, z14.s\n"
- "smax z9.s, p4/M, z9.s, z20.s\n"
- "smax z18.s, p4/M, z18.s, z20.s\n"
- "smin z10.s, p4/M, z10.s, z15.s\n"
- "smin z22.s, p4/M, z22.s, z15.s\n"
- "trn1 z11.h, z11.h, z9.h\n"
- "st1b { z11.h }, p0, [x13, x8]\n"
- "smax z10.s, p4/M, z10.s, z20.s\n"
- ".inst 0x448292b7 // srshl z23.s, p4/M, z23.s, z21.s\n"
- "smax z22.s, p4/M, z22.s, z20.s\n"
- "trn1 z18.h, z18.h, z10.h\n"
- "st1b { z18.h }, p0, [x12, x8]\n"
- "add z23.s, z23.s, z14.s\n"
- "smin z23.s, p4/M, z23.s, z15.s\n"
- "smax z23.s, p4/M, z23.s, z20.s\n"
- "trn1 z22.h, z22.h, z23.h\n"
- "st1b { z22.h }, p0, [x11, x8]\n"
- "inch x8\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z18.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z18.s, z16.s\n"
- "mov z11.d, z13.d\n"
- "ld1sb { z0.h }, p4/Z, [x6]\n"
+ "and z26.d, z22.d, z31.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z20.d, z18.d, z31.d\n"
+ "sqadd z9.s, z9.s, z19.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z2.s\n"
+ ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z0.s\n"
+ ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
+ "sqadd z22.s, z22.s, z26.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
+ ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
+ ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
+ ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
+ ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
+ ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
+ ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
+ ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
+ "sqadd z13.h, z13.h, z14.h\n"
+ "sqadd z9.h, z9.h, z14.h\n"
+ "smax z13.h, p4/M, z13.h, z16.h\n"
+ "smax z9.h, p4/M, z9.h, z16.h\n"
+ "sqadd z11.h, z11.h, z14.h\n"
+ "sqadd z21.h, z21.h, z14.h\n"
+ "smax z11.h, p4/M, z11.h, z16.h\n"
+ "smax z21.h, p4/M, z21.h, z16.h\n"
+ "smin z13.h, p4/M, z13.h, z15.h\n"
+ "smin z9.h, p4/M, z9.h, z15.h\n"
+ "st1b { z13.h }, p0, [x16, x10]\n"
+ "smin z11.h, p4/M, z11.h, z15.h\n"
+ "smin z21.h, p4/M, z21.h, z15.h\n"
+ "st1b { z9.h }, p0, [x15, x10]\n"
+ "st1b { z11.h }, p0, [x14, x10]\n"
+ "st1b { z21.h }, p0, [x13, x10]\n"
+ "ld1sb { z0.h }, p4/Z, [x17]\n"
+ "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- "mov z9.d, z16.d\n"
- "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
- "mov z18.d, z13.d\n"
- "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "mov z10.d, z16.d\n"
- "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
- "mov z22.d, z13.d\n"
- "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- "mov z23.d, z16.d\n"
- "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "inch x6, ALL, MUL #8\n"
- "ld1sb { z8.h }, p4/Z, [x6]\n"
- "ldp x26, x25, [x16, #0x0]\n"
+ "ld1w { z18.s }, p2/Z, [x12]\n"
+ "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z8.s\n"
+ "uzp2 z17.s, z18.s, z8.s\n"
+ "ld1sb { z8.h }, p4/Z, [x17]\n"
+ "ldp x9, x28, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z9.d, z13.d\n"
+ "mov z10.d, z17.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x7]\n"
+ "mov z11.d, z13.d\n"
+ "mov z22.d, z17.d\n"
+ "ld1sb { z30.h }, p3/Z, [x28, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x7]\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z17.d\n"
+ "ld1sb { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ldp x24, x23, [x16, #0x10]\n"
+ "ld1sb { z26.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ldp x22, x21, [x16, #0x20]\n"
- "ldp x20, x19, [x16, #0x30]\n"
- "ld1sb { z31.h }, p3/Z, [x26, x7]\n"
- ".inst 0x455313ff // ssublb z31.h, z31.b, z19.b\n"
- "ld1sb { z30.h }, p3/Z, [x25, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
- ".inst 0x455313de // ssublb z30.h, z30.b, z19.b\n"
- "ld1sb { z28.h }, p3/Z, [x23, x7]\n"
- "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
- "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
- "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
- "ld1sb { z24.h }, p3/Z, [x19, x7]\n"
- ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
- ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
- ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
- ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
+ "ld1sb { z24.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455713ff // ssublb z31.h, z31.b, z23.b\n"
+ ".inst 0x455713de // ssublb z30.h, z30.b, z23.b\n"
+ ".inst 0x455713bd // ssublb z29.h, z29.b, z23.b\n"
+ ".inst 0x4557139c // ssublb z28.h, z28.b, z23.b\n"
+ ".inst 0x4557137b // ssublb z27.h, z27.b, z23.b\n"
+ ".inst 0x4557135a // ssublb z26.h, z26.b, z23.b\n"
+ ".inst 0x45571339 // ssublb z25.h, z25.b, z23.b\n"
+ ".inst 0x45571318 // ssublb z24.h, z24.b, z23.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 4733c89199..d8f4d8d199 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,546 +111,538 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x0, #0x0\n"
+ "mov x24, x0\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
- "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x2, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x3, #0x0\n"
- "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x5, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z17.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z13.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z5.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x2, x0\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x2, x0\n"
- "ldp x7, x8, [x21, #0x0]\n"
- "mov x19, x2\n"
- "incw x19\n"
- "ldp x17, x16, [x21, #0x10]\n"
- "whilelt p1.s, x19, x0\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z19.s }, p2/Z, [x19]\n"
- "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z19.s, z6.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z19.s, z6.s\n"
- "mov z19.d, z11.d\n"
- "ld1sb { z0.h }, p4/Z, [x1]\n"
- ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
- "mov z9.d, z16.d\n"
- "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
- "mov z7.d, z11.d\n"
- "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
- "mov z6.d, z16.d\n"
- "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
- "mov z12.d, z11.d\n"
- "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- "mov z8.d, z16.d\n"
- "ldp x28, x27, [x5, #0x0]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- "ldp x26, x25, [x5, #0x10]\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- "ldp x24, x23, [x5, #0x20]\n"
- "ldp x22, x21, [x5, #0x30]\n"
- "ldp x20, x19, [x5, #0x40]\n"
- "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
- "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
- "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
- "ld1sb { z28.h }, p3/Z, [x25, x2]\n"
- "ld1sb { z27.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
- "ld1sb { z23.h }, p3/Z, [x23, x2]\n"
- ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
- "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
- "ld1sb { z24.h }, p3/Z, [x21, x2]\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
- "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
- "ld1sb { z22.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
- ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
- ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
- ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z15.b }, p4/Z, [x21]\n"
+ "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z11.h }, p4/Z, [x20]\n"
+ "ldp x3, x4, [x22, #0x0]\n"
+ "whilelt p3.h, x0, x1\n"
+ "ldp x5, x6, [x22, #0x10]\n"
+ "whilelt p2.s, x0, x1\n"
+ "whilelt p1.s, x24, x1\n"
+ "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x7, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z30.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z14.s, z30.s, z16.s\n"
+ "ld1sb { z0.h }, p4/Z, [x2]\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
+ "uzp2 z10.s, z30.s, z16.s\n"
+ "addvl x14, x14, #2\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
+ "mov x8, #0x0\n"
+ "mov z20.d, z14.d\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
+ "ldp x9, x28, [x7, #0x0]\n"
+ "mov z7.d, z10.d\n"
+ "mov z8.d, z14.d\n"
+ "ldp x27, x26, [x7, #0x10]\n"
+ "ldp x25, x24, [x7, #0x20]\n"
+ "mov z16.d, z10.d\n"
+ "mov z6.d, z14.d\n"
+ "ldp x23, x22, [x7, #0x30]\n"
+ "ldp x21, x20, [x7, #0x40]\n"
+ "mov z5.d, z10.d\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
+ "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ "ld1sb { z29.h }, p3/Z, [x27, x0]\n"
+ "ld1sb { z28.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ "ld1sb { z27.h }, p3/Z, [x25, x0]\n"
+ "ld1sb { z23.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
+ "ld1sb { z25.h }, p3/Z, [x23, x0]\n"
+ "ld1sb { z24.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f13bd // ssublb z29.h, z29.b, z15.b\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ "ld1sb { z26.h }, p3/Z, [x21, x0]\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
+ ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x14, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
+ ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
+ ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
+ ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
"1:" // Loop
- ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x5, #0x50]\n"
- "whilelt p0.h, x3, x0\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- "ldr x19, [x5, #0x58]\n"
- ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
- "ldr x25, [x5, #0x60]\n"
- ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
- ".inst 0x448043a7 // smlalb z7.s, p4/M, z29.h, z0.h\n"
- "ldr x24, [x5, #0x68]\n"
- ".inst 0x448047a6 // smlalt z6.s, p4/M, z29.h, z0.h\n"
- "ldr x23, [x5, #0x70]\n"
- ".inst 0x4480438c // smlalb z12.s, p4/M, z28.h, z0.h\n"
- "ldr x22, [x5, #0x78]\n"
- ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
- ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
- "ldr x15, [x5, #0x80]\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- "ld1sb { z30.h }, p3/Z, [x19, x2]\n"
- ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
- ".inst 0x44814373 // smlalb z19.s, p4/M, z27.h, z1.h\n"
- "ldr x21, [x5, #0x88]\n"
- ".inst 0x44814769 // smlalt z9.s, p4/M, z27.h, z1.h\n"
- "ldr x20, [x5, #0x90]\n"
- ".inst 0x44814387 // smlalb z7.s, p4/M, z28.h, z1.h\n"
- "ldr x19, [x5, #0x98]\n"
- ".inst 0x44814786 // smlalt z6.s, p4/M, z28.h, z1.h\n"
- "ldr x14, [x5, #0xa0]\n"
- ".inst 0x448142ec // smlalb z12.s, p4/M, z23.h, z1.h\n"
- "ldr x13, [x5, #0xa8]\n"
- ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
- ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
- "ldr x12, [x5, #0xb0]\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x25, x2]\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
- ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
- "ldr x11, [x5, #0xb8]\n"
- ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
- "ldr x10, [x5, #0xc0]\n"
- ".inst 0x448242e7 // smlalb z7.s, p4/M, z23.h, z2.h\n"
- "ldr x9, [x5, #0xc8]\n"
- ".inst 0x448246e6 // smlalt z6.s, p4/M, z23.h, z2.h\n"
- "ldr x28, [x5, #0xd0]\n"
- ".inst 0x448243ec // smlalb z12.s, p4/M, z31.h, z2.h\n"
- "ldr x27, [x5, #0xd8]\n"
- ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
- "ldr x26, [x5, #0xe0]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- "ld1sb { z25.h }, p3/Z, [x24, x2]\n"
- ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
- "ldr x25, [x5, #0xe8]\n"
- ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
- ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
- "ld1w { z18.s }, p2/Z, [x4]\n"
- ".inst 0x448343e7 // smlalb z7.s, p4/M, z31.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n"
- "addvl x4, x4, #2\n"
- ".inst 0x448347e6 // smlalt z6.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343cc // smlalb z12.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- "uzp1 z21.s, z18.s, z20.s\n"
- "uzp2 z10.s, z18.s, z20.s\n"
- "ld1w { z18.s }, p2/Z, [x6]\n"
- ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
- "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n"
- "addvl x6, x6, #2\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- "ld1sb { z24.h }, p3/Z, [x23, x2]\n"
- ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
- ".inst 0x44844373 // smlalb z19.s, p4/M, z27.h, z4.h\n"
- "ldr x24, [x5, #0xf0]\n"
- ".inst 0x44844769 // smlalt z9.s, p4/M, z27.h, z4.h\n"
- "ld1sb { z27.h }, p3/Z, [x22, x2]\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
- ".inst 0x448443c7 // smlalb z7.s, p4/M, z30.h, z4.h\n"
- "ldr x23, [x5, #0xf8]\n"
- ".inst 0x448447c6 // smlalt z6.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- ".inst 0x448043ab // smlalb z11.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
+ "ldr x20, [x7, #0x50]\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
+ "ldr x22, [x7, #0x58]\n"
+ ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
+ ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
+ "ldr x21, [x7, #0x60]\n"
+ "ldr x20, [x7, #0x68]\n"
+ ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z30.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "uzp1 z29.s, z18.s, z20.s\n"
- "uzp2 z20.s, z18.s, z20.s\n"
- ".inst 0x44804393 // smlalb z19.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n"
- ".inst 0x448042c7 // smlalb z7.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448046c6 // smlalt z6.s, p4/M, z22.h, z0.h\n"
- ".inst 0x4480432c // smlalb z12.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
+ "ldr x25, [x7, #0x70]\n"
+ "ldr x24, [x7, #0x78]\n"
+ ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
+ "ldr x15, [x7, #0x80]\n"
+ "ldr x23, [x7, #0x88]\n"
+ ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
+ "ld1sb { z27.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- "ld1sb { z28.h }, p3/Z, [x21, x2]\n"
- ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
- ".inst 0x448142f3 // smlalb z19.s, p4/M, z23.h, z1.h\n"
- "ldr x22, [x5, #0x100]\n"
- ".inst 0x448146e9 // smlalt z9.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44814327 // smlalb z7.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814726 // smlalt z6.s, p4/M, z25.h, z1.h\n"
- ".inst 0x4481430c // smlalb z12.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
- ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
- ".inst 0x448242eb // smlalb z11.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
+ "ldr x22, [x7, #0x90]\n"
+ "ldr x21, [x7, #0x98]\n"
+ ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
+ "ldr x14, [x7, #0xa0]\n"
+ "ldr x13, [x7, #0xa8]\n"
+ ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- "ld1sb { z23.h }, p3/Z, [x15, x2]\n"
- ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
- ".inst 0x448243f3 // smlalb z19.s, p4/M, z31.h, z2.h\n"
- "ldr x21, [x5, #0x108]\n"
- ".inst 0x448247e9 // smlalt z9.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44824307 // smlalb z7.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824706 // smlalt z6.s, p4/M, z24.h, z2.h\n"
- ".inst 0x4482436c // smlalb z12.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- ".inst 0x448343eb // smlalb z11.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
+ "ldr x12, [x7, #0xb0]\n"
+ "ldr x20, [x7, #0xb8]\n"
+ ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ "ldr x11, [x7, #0xc0]\n"
+ ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z24.h }, p3/Z, [x25, x0]\n"
+ ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
- ".inst 0x448343d3 // smlalb z19.s, p4/M, z30.h, z3.h\n"
- "ldr x20, [x5, #0x110]\n"
- ".inst 0x448347c9 // smlalt z9.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834367 // smlalb z7.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834766 // smlalt z6.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342ec // smlalb z12.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- ".inst 0x448443cb // smlalb z11.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
+ "ldr x10, [x7, #0xc8]\n"
+ "ldr x9, [x7, #0xd0]\n"
+ ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
+ "ldr x28, [x7, #0xd8]\n"
+ "ldr x27, [x7, #0xe0]\n"
+ ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
+ "ld1sb { z27.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- "ld1sb { z30.h }, p3/Z, [x19, x2]\n"
- ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
- ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
- "ldr x19, [x5, #0x118]\n"
- ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z26.h }, p3/Z, [x14, x2]\n"
- ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
- ".inst 0x448442e7 // smlalb z7.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448446e6 // smlalt z6.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4484438c // smlalb z12.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- ".inst 0x448042cb // smlalb z11.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
+ "ldr x26, [x7, #0xe8]\n"
+ "ldr x25, [x7, #0xf0]\n"
+ ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
+ "ld1w { z19.s }, p2/Z, [x17]\n"
+ "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
+ ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z28.h }, p3/Z, [x23, x0]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- "ld1sb { z22.h }, p3/Z, [x11, x2]\n"
- ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
- ".inst 0x44804333 // smlalb z19.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804729 // smlalt z9.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448043e7 // smlalb z7.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e6 // smlalt z6.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043cc // smlalb z12.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4481432b // smlalb z11.s, p4/M, z25.h, z1.h\n"
- ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
+ "ldr x24, [x7, #0xf8]\n"
+ "uzp1 z9.s, z19.s, z18.s\n"
+ ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
+ "uzp2 z29.s, z19.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16]\n"
+ ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
+ "ld1sb { z23.h }, p3/Z, [x15, x0]\n"
+ ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- "ld1sb { z25.h }, p3/Z, [x13, x2]\n"
- ".inst 0x44814313 // smlalb z19.s, p4/M, z24.h, z1.h\n"
- ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
- ".inst 0x44814709 // smlalt z9.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448143c7 // smlalb z7.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448147c6 // smlalt z6.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4481434c // smlalb z12.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1]\n"
- ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
- ".inst 0x4482430b // smlalb z11.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
+ "ldr x23, [x7, #0x100]\n"
+ "whilelt p0.h, x8, x1\n"
+ ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
+ "addvl x17, x17, #2\n"
+ ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
+ "ld1sb { z31.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- "ld1sb { z24.h }, p3/Z, [x12, x2]\n"
- ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
- ".inst 0x44824373 // smlalb z19.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824347 // smlalb z7.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824746 // smlalt z6.s, p4/M, z26.h, z2.h\n"
- ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
+ "ldr x22, [x7, #0x108]\n"
+ ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
+ "ld1sb { z30.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- "ld1sb { z27.h }, p3/Z, [x10, x2]\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
- ".inst 0x448342f3 // smlalb z19.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448346e9 // smlalt z9.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
- ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- ".inst 0x448442eb // smlalb z11.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
+ "ldr x21, [x7, #0x110]\n"
+ ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z26.h }, p3/Z, [x14, x0]\n"
+ ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- "ld1sb { z23.h }, p3/Z, [x9, x2]\n"
- ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
- ".inst 0x44844393 // smlalb z19.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z28.h }, p3/Z, [x26, x2]\n"
- ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
- ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442cc // smlalb z12.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
+ ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
+ "ldr x20, [x7, #0x118]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z25.h }, p3/Z, [x13, x0]\n"
+ ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
- ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44804367 // smlalb z7.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44804766 // smlalt z6.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042ec // smlalb z12.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
- ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z24.h }, p3/Z, [x12, x0]\n"
+ ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
- ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
- ".inst 0x44814353 // smlalb z19.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44814749 // smlalt z9.s, p4/M, z26.h, z1.h\n"
- ".inst 0x448142e7 // smlalb z7.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448146e6 // smlalt z6.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143ec // smlalb z12.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
- ".inst 0x4482434b // smlalb z11.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z27.h }, p3/Z, [x11, x0]\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- "ld1sb { z26.h }, p3/Z, [x25, x2]\n"
- ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
- ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448243e7 // smlalb z7.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247e6 // smlalt z6.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243cc // smlalb z12.s, p4/M, z30.h, z2.h\n"
- ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
+ "ld1sb { z23.h }, p3/Z, [x10, x0]\n"
+ ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- "ld1sb { z25.h }, p3/Z, [x24, x2]\n"
- ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
- ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448343c7 // smlalb z7.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448347c6 // smlalt z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4483438c // smlalb z12.s, p4/M, z28.h, z3.h\n"
- ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
+ ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- "ld1sb { z24.h }, p3/Z, [x23, x2]\n"
- ".inst 0x448442d3 // smlalb z19.s, p4/M, z22.h, z4.h\n"
- ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
- ".inst 0x448446c9 // smlalt z9.s, p4/M, z22.h, z4.h\n"
- ".inst 0x44844387 // smlalb z7.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844786 // smlalt z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1]\n"
- "inch x1\n"
- ".inst 0x4480436b // smlalb z11.s, p4/M, z27.h, z0.h\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
+ "ld1sb { z28.h }, p3/Z, [x27, x0]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- "ld1sb { z27.h }, p3/Z, [x22, x2]\n"
- ".inst 0x448042f3 // smlalb z19.s, p4/M, z23.h, z0.h\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
- ".inst 0x448046e9 // smlalt z9.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44804327 // smlalb z7.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804726 // smlalt z6.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z25.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
- ".inst 0x4480430c // smlalb z12.s, p4/M, z24.h, z0.h\n"
- ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448142eb // smlalb z11.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
+ "ld1sb { z26.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143f3 // smlalb z19.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814307 // smlalb z7.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814706 // smlalt z6.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z24.h }, p3/Z, [x20, x2]\n"
- ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
- ".inst 0x4481436c // smlalb z12.s, p4/M, z27.h, z1.h\n"
- ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z25.h }, p3/Z, [x25, x0]\n"
+ ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243d3 // smlalb z19.s, p4/M, z30.h, z2.h\n"
- ".inst 0x448247c9 // smlalt z9.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824367 // smlalb z7.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824766 // smlalt z6.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z27.h }, p3/Z, [x19, x2]\n"
- "inch x2\n"
- ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
- "whilelt p2.s, x2, x0\n"
- ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
- "mov x19, x2\n"
- ".inst 0x448343cb // smlalb z11.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z24.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- "incw x19\n"
- ".inst 0x44834393 // smlalb z19.s, p4/M, z28.h, z3.h\n"
- "whilelt p1.s, x19, x0\n"
- ".inst 0x44834789 // smlalt z9.s, p4/M, z28.h, z3.h\n"
- "whilelt p3.h, x2, x0\n"
- ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
- ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
- ".inst 0x4484438b // smlalb z11.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04b5756b // sqrdmulh z11.s, z11.s, z21.s\n"
- ".inst 0x04aa7610 // sqrdmulh z16.s, z16.s, z10.s\n"
- ".inst 0x04b57673 // sqrdmulh z19.s, z19.s, z21.s\n"
- ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
- "and z31.d, z11.d, z29.d\n"
- "asr z31.s, z31.s, #0x1f\n"
- "and z23.d, z16.d, z20.d\n"
- "and z25.d, z19.d, z29.d\n"
- "asr z23.s, z23.s, #0x1f\n"
- "and z18.d, z9.d, z20.d\n"
- ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
- "asr z25.s, z25.s, #0x1f\n"
- ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x0]\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
+ ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
+ "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
+ "addvl x16, x16, #2\n"
+ ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
+ "inch x2\n"
+ ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z25.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
+ "uzp1 z23.s, z19.s, z18.s\n"
+ ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
+ "uzp2 z22.s, z19.s, z18.s\n"
+ ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z24.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
+ ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
+ ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
+ "inch x0\n"
+ ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
+ "and z21.d, z14.d, z23.d\n"
+ "mov x20, x0\n"
+ ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "incw x20\n"
+ ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
+ "whilelt p2.s, x0, x1\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
+ "and z3.d, z10.d, z22.d\n"
+ "whilelt p1.s, x20, x1\n"
+ ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
+ "whilelt p3.h, x0, x1\n"
+ ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sqadd z14.s, z14.s, z21.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
+ "and z19.d, z20.d, z23.d\n"
+ ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
+ "and z18.d, z8.d, z23.d\n"
+ ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
+ "and z21.d, z6.d, z23.d\n"
+ ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
+ "and z1.d, z7.d, z22.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z11.s, z11.s, z31.s\n"
- ".inst 0x4484436c // smlalb z12.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04b574e7 // sqrdmulh z7.s, z7.s, z21.s\n"
- "sqadd z16.s, z16.s, z23.s\n"
- "sqadd z19.s, z19.s, z25.s\n"
- ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n"
- "sqadd z9.s, z9.s, z18.s\n"
- "and z1.d, z7.d, z29.d\n"
+ "and z2.d, z16.d, z22.d\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z3.d, z5.d, z22.d\n"
+ "sqadd z20.s, z20.s, z19.s\n"
+ ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
"asr z1.s, z1.s, #0x1f\n"
- "and z18.d, z6.d, z20.d\n"
- ".inst 0x04b5758c // sqrdmulh z12.s, z12.s, z21.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448293ab // srshl z11.s, p4/M, z11.s, z29.s\n"
- "and z30.d, z12.d, z29.d\n"
- "asr z30.s, z30.s, #0x1f\n"
- "add z11.s, z11.s, z14.s\n"
- "sqadd z7.s, z7.s, z1.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
- "smin z11.s, p4/M, z11.s, z15.s\n"
- ".inst 0x44829290 // srshl z16.s, p4/M, z16.s, z20.s\n"
- "sqadd z12.s, z12.s, z30.s\n"
- "and z3.d, z8.d, z20.d\n"
+ "sqadd z8.s, z8.s, z18.s\n"
+ ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z21.s\n"
+ ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
"asr z3.s, z3.s, #0x1f\n"
- "add z16.s, z16.s, z14.s\n"
- "smax z11.s, p4/M, z11.s, z5.s\n"
- ".inst 0x448293b3 // srshl z19.s, p4/M, z19.s, z29.s\n"
- ".inst 0x44829289 // srshl z9.s, p4/M, z9.s, z20.s\n"
- "smin z16.s, p4/M, z16.s, z15.s\n"
- ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
- "add z19.s, z19.s, z14.s\n"
- "add z9.s, z9.s, z14.s\n"
- "sqadd z8.s, z8.s, z3.s\n"
- "add z7.s, z7.s, z14.s\n"
- "smax z16.s, p4/M, z16.s, z5.s\n"
- "smin z19.s, p4/M, z19.s, z15.s\n"
- "smin z9.s, p4/M, z9.s, z15.s\n"
- "smin z7.s, p4/M, z7.s, z15.s\n"
- "trn1 z11.h, z11.h, z16.h\n"
- "st1b { z11.h }, p0, [x7, x3]\n"
- "smax z19.s, p4/M, z19.s, z5.s\n"
- "smax z9.s, p4/M, z9.s, z5.s\n"
- "smax z7.s, p4/M, z7.s, z5.s\n"
- ".inst 0x44829286 // srshl z6.s, p4/M, z6.s, z20.s\n"
- ".inst 0x448293ac // srshl z12.s, p4/M, z12.s, z29.s\n"
- "trn1 z19.h, z19.h, z9.h\n"
- "st1b { z19.h }, p0, [x8, x3]\n"
- "add z6.s, z6.s, z14.s\n"
- ".inst 0x44829288 // srshl z8.s, p4/M, z8.s, z20.s\n"
- "add z12.s, z12.s, z14.s\n"
- "smin z6.s, p4/M, z6.s, z15.s\n"
- "add z8.s, z8.s, z14.s\n"
- "smin z12.s, p4/M, z12.s, z15.s\n"
- "smax z6.s, p4/M, z6.s, z5.s\n"
- "smin z8.s, p4/M, z8.s, z15.s\n"
- "smax z12.s, p4/M, z12.s, z5.s\n"
- "trn1 z7.h, z7.h, z6.h\n"
- "st1b { z7.h }, p0, [x17, x3]\n"
- "smax z8.s, p4/M, z8.s, z5.s\n"
- "trn1 z12.h, z12.h, z8.h\n"
- "st1b { z12.h }, p0, [x16, x3]\n"
- "inch x3\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z19.s }, p2/Z, [x19]\n"
- "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z19.s, z6.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z19.s, z6.s\n"
- "mov z19.d, z11.d\n"
- "ld1sb { z0.h }, p4/Z, [x1]\n"
- ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
- "mov z9.d, z16.d\n"
- "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
- "mov z7.d, z11.d\n"
- "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
- "mov z6.d, z16.d\n"
- "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
- "mov z12.d, z11.d\n"
- "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- "mov z8.d, z16.d\n"
- "ldp x28, x27, [x5, #0x0]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- "ldp x26, x25, [x5, #0x10]\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- "ldp x24, x23, [x5, #0x20]\n"
- "ldp x22, x21, [x5, #0x30]\n"
- "ldp x20, x19, [x5, #0x40]\n"
- "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
- "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
- "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
- "ld1sb { z28.h }, p3/Z, [x25, x2]\n"
- "ld1sb { z27.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
- "ld1sb { z23.h }, p3/Z, [x23, x2]\n"
- ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
- "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
- "ld1sb { z24.h }, p3/Z, [x21, x2]\n"
- ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
- "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
- "ld1sb { z22.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
- ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
- ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
- ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ "sqadd z7.s, z7.s, z1.s\n"
+ ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z5.s, z5.s, z3.s\n"
+ ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
+ ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
+ ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
+ ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
+ ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
+ "sqadd z14.h, z14.h, z12.h\n"
+ "sqadd z20.h, z20.h, z12.h\n"
+ "smax z14.h, p4/M, z14.h, z13.h\n"
+ "smax z20.h, p4/M, z20.h, z13.h\n"
+ "sqadd z8.h, z8.h, z12.h\n"
+ "sqadd z6.h, z6.h, z12.h\n"
+ "smax z8.h, p4/M, z8.h, z13.h\n"
+ "smax z6.h, p4/M, z6.h, z13.h\n"
+ "smin z14.h, p4/M, z14.h, z11.h\n"
+ "smin z20.h, p4/M, z20.h, z11.h\n"
+ "st1b { z14.h }, p0, [x3, x8]\n"
+ "smin z8.h, p4/M, z8.h, z11.h\n"
+ "smin z6.h, p4/M, z6.h, z11.h\n"
+ "st1b { z20.h }, p0, [x4, x8]\n"
+ "st1b { z8.h }, p0, [x5, x8]\n"
+ "st1b { z6.h }, p0, [x6, x8]\n"
+ "ld1w { z30.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z14.s, z30.s, z16.s\n"
+ "ld1sb { z0.h }, p4/Z, [x2]\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
+ "uzp2 z10.s, z30.s, z16.s\n"
+ "addvl x14, x14, #2\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
+ "inch x8\n"
+ "str x14, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
+ "ldp x9, x28, [x7, #0x0]\n"
+ "mov z20.d, z14.d\n"
+ "mov z7.d, z10.d\n"
+ "ldp x27, x26, [x7, #0x10]\n"
+ "ldp x25, x24, [x7, #0x20]\n"
+ "mov z8.d, z14.d\n"
+ "mov z16.d, z10.d\n"
+ "ldp x23, x22, [x7, #0x30]\n"
+ "ldp x21, x20, [x7, #0x40]\n"
+ "mov z6.d, z14.d\n"
+ "mov z5.d, z10.d\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x0]\n"
+ "ld1sb { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ "ld1sb { z29.h }, p3/Z, [x27, x0]\n"
+ "ld1sb { z28.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ "ld1sb { z27.h }, p3/Z, [x25, x0]\n"
+ "ld1sb { z23.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x454f13ff // ssublb z31.h, z31.b, z15.b\n"
+ "ld1sb { z25.h }, p3/Z, [x23, x0]\n"
+ "ld1sb { z24.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
+ ".inst 0x454f13bd // ssublb z29.h, z29.b, z15.b\n"
+ "ld1sb { z26.h }, p3/Z, [x21, x0]\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f137b // ssublb z27.h, z27.b, z15.b\n"
+ ".inst 0x454f12f7 // ssublb z23.h, z23.b, z15.b\n"
+ ".inst 0x454f1339 // ssublb z25.h, z25.b, z15.b\n"
+ ".inst 0x454f1318 // ssublb z24.h, z24.b, z15.b\n"
+ ".inst 0x454f135a // ssublb z26.h, z26.b, z15.b\n"
+ ".inst 0x454f12d6 // ssublb z22.h, z22.b, z15.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index ea7acf5b6e..6fba4d47d2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,309 +41,295 @@ void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "mov z31.s, #0x0\n"
- "ldr x24, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "mov z18.s, #0x0\n"
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
"ldr x23, [%x[inptrs], #0x8]\n"
- "lsl x9, %x[n_channels], #0x2\n"
- "mov z29.s, #0x0\n"
- "ldr x22, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "mov z28.s, #0x0\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
- "mov x19, #0x9\n"
- "mov z13.s, #0x0\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "whilelt p1.b, XZR, x19\n"
- "mov z14.s, #0x0\n"
- "ld1b { z7.b }, p1/Z, [x24]\n"
- "mov x19, #0x3\n"
- "mov z15.s, #0x0\n"
- "ld1b { z3.b }, p1/Z, [x23]\n"
- "whilelt p0.b, XZR, x19\n"
- "mov z11.b, p0/z, #0x1\n"
- "ld1b { z4.b }, p1/Z, [x22]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z15.b, #0x1\n"
+ "lsr z15.s, z15.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z30.d, z1.d\n"
+ "mov z29.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z28.d, z1.d\n"
+ "mov z27.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z26.d, z2.d\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z24.d, z4.d\n"
+ "mov z23.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z22.d, z4.d\n"
+ "ext z30.b, z30.b, z30.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z14.s, p2/M, z14.s\n"
+ "ext z29.b, z29.b, z29.b, #0x4\n"
+ "ext z28.b, z28.b, z28.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p1.b, x9, x10\n"
+ "ext z27.b, z27.b, z27.b, #0x2\n"
+ "ext z26.b, z26.b, z26.b, #0x4\n"
+ "ld1w { z13.s }, p1/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "mov z10.d, z7.d\n"
- "ld1b { z6.b }, p1/Z, [x21]\n"
- "mov x27, #0x0\n"
- "ext z10.b, z10.b, z10.b, #0x2\n"
- "ld1b { z5.b }, p1/Z, [x20]\n"
- "whilelt p1.b, x28, x9\n"
- "mov z17.d, z7.d\n"
- "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z26.d, z7.d\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "ext z26.b, z26.b, z26.b, #0x6\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
- "mov z19.d, z3.d\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "ext z19.b, z19.b, z19.b, #0x2\n"
+ "ext z25.b, z25.b, z25.b, #0x6\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z23.b, z23.b, z23.b, #0x4\n"
+ "ext z22.b, z22.b, z22.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
"ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z7.s, z7.s, z17.s\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "zip1 z10.s, z10.s, z26.s\n"
- "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "zip1 z7.s, z7.s, z10.s\n"
- "ld1w { z1.s }, p1/Z, [%x[params]]\n"
- "mov z7.q, z7.q[0]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z18.d, z3.d\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p1/Z, [%x[params], #1, MUL VL]\n"
"mov z17.d, z3.d\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
- "mov z2.d, z3.d\n"
- "mov z20.d, z4.d\n"
- "ext z2.b, z2.b, z2.b, #0x6\n"
- "zip1 z3.s, z3.s, z17.s\n"
- "ext z20.b, z20.b, z20.b, #0x2\n"
- "mov z17.d, z4.d\n"
- "zip1 z19.s, z19.s, z2.s\n"
- "zip1 z3.s, z3.s, z19.s\n"
- "mov z3.q, z3.q[0]\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z29.s\n"
+ "zip1 z30.s, z30.s, z28.s\n"
+ "zip1 z2.s, z2.s, z26.s\n"
+ "zip1 z27.s, z27.s, z25.s\n"
+ "ext z18.b, z18.b, z18.b, #0x2\n"
"ext z17.b, z17.b, z17.b, #0x4\n"
- "mov z26.d, z4.d\n"
- "ext z26.b, z26.b, z26.b, #0x6\n"
- "mov z21.d, z6.d\n"
- "zip1 z4.s, z4.s, z17.s\n"
- "ext z21.b, z21.b, z21.b, #0x2\n"
- "zip1 z20.s, z20.s, z26.s\n"
- "zip1 z4.s, z4.s, z20.s\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z23.s\n"
+ "zip1 z24.s, z24.s, z22.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z30.s\n"
+ "zip1 z2.s, z2.s, z27.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z18.s, z18.s, z16.s\n"
+ "zip1 z4.s, z4.s, z24.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z18.s\n"
"mov z4.q, z4.q[0]\n"
- "mov z17.d, z6.d\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "mov z20.d, z6.d\n"
- "ext z20.b, z20.b, z20.b, #0x6\n"
- "mov z19.d, z5.d\n"
- "zip1 z6.s, z6.s, z17.s\n"
- "ext z19.b, z19.b, z19.b, #0x2\n"
- "zip1 z21.s, z21.s, z20.s\n"
- "zip1 z6.s, z6.s, z21.s\n"
- "mov z6.q, z6.q[0]\n"
- "mov z17.d, z5.d\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "mov z20.d, z5.d\n"
- "ext z20.b, z20.b, z20.b, #0x6\n"
- "mov z11.s, z11.s[0]\n"
- "zip1 z5.s, z5.s, z17.s\n"
- "mov z25.s, #0x0\n"
- "zip1 z19.s, z19.s, z20.s\n"
- "zip1 z5.s, z5.s, z19.s\n"
- "mov z5.q, z5.q[0]\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
"mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "sdot z24.s, z15.b, z1.b[0]\n"
"mov z23.s, #0x0\n"
"mov z22.s, #0x0\n"
+ "sdot z25.s, z15.b, z1.b[1]\n"
"mov z21.s, #0x0\n"
- "mov z17.s, #0x0\n"
"mov z20.s, #0x0\n"
- "mov z2.s, #0x0\n"
+ "sdot z23.s, z15.b, z1.b[2]\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "sdot z22.s, z15.b, z1.b[3]\n"
"mov z19.s, #0x0\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "sdot z18.s, z11.b, z7.b[1]\n"
- "sdot z29.s, z11.b, z7.b[2]\n"
- "sdot z28.s, z11.b, z7.b[3]\n"
- "sdot z13.s, z11.b, z3.b[0]\n"
- "sdot z14.s, z11.b, z3.b[1]\n"
- "sdot z15.s, z11.b, z3.b[2]\n"
- "sdot z25.s, z11.b, z3.b[3]\n"
- "sdot z26.s, z11.b, z4.b[0]\n"
- "sdot z27.s, z11.b, z4.b[1]\n"
- "sdot z24.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z4.b[3]\n"
- "sdot z22.s, z11.b, z6.b[0]\n"
- "sdot z21.s, z11.b, z6.b[1]\n"
- "sdot z17.s, z11.b, z6.b[2]\n"
- "sdot z20.s, z11.b, z6.b[3]\n"
- "sdot z2.s, z11.b, z5.b[0]\n"
- "sdot z19.s, z11.b, z5.b[1]\n"
- "mov z31.d, z31.d\n"
- "mov z18.d, z18.d\n"
- "mov z29.d, z29.d\n"
- "mov z28.d, z28.d\n"
- "add z31.s, z31.s, z13.s\n"
- "mov z13.s, #0x0\n"
- "sdot z13.s, z11.b, z5.b[2]\n"
- "add z18.s, z18.s, z14.s\n"
- "mov z14.s, #0x0\n"
- "sdot z14.s, z11.b, z5.b[3]\n"
- "add z29.s, z29.s, z15.s\n"
- "add z28.s, z28.s, z25.s\n"
- "add z31.s, z31.s, z26.s\n"
- "add z18.s, z18.s, z27.s\n"
- "add z29.s, z29.s, z24.s\n"
- "add z28.s, z28.s, z23.s\n"
- "mov z26.d, z26.d\n"
- "mov z25.d, z27.d\n"
- "mov z24.d, z24.d\n"
- "mov z23.d, z23.d\n"
- "add z26.s, z26.s, z22.s\n"
- "add z25.s, z25.s, z21.s\n"
- "add z24.s, z24.s, z17.s\n"
- "add z23.s, z23.s, z20.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z25.s, z25.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "sdot z21.s, z15.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z20.s, z15.b, z2.b[1]\n"
+ "sdot z9.s, z15.b, z2.b[2]\n"
+ "sdot z8.s, z15.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z15.b, z4.b[0]\n"
+ "sdot z18.s, z15.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "sdot z17.s, z15.b, z4.b[2]\n"
+ "sdot z16.s, z15.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "sdot z31.s, z15.b, z0.b[0]\n"
+ "mov z28.s, #0x0\n"
+ "sdot z30.s, z15.b, z0.b[1]\n"
+ "sdot z29.s, z15.b, z0.b[2]\n"
+ "sdot z28.s, z15.b, z0.b[3]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z20.s\n"
+ "add z26.s, z23.s, z9.s\n"
+ "add z27.s, z22.s, z8.s\n"
+ "add z23.s, z19.s, z21.s\n"
+ "mov z22.s, #0x0\n"
+ "sdot z22.s, z15.b, z3.b[0]\n"
+ "add z21.s, z18.s, z20.s\n"
+ "mov z20.s, #0x0\n"
+ "sdot z20.s, z15.b, z3.b[1]\n"
+ "add z19.s, z17.s, z9.s\n"
+ "mov z18.s, #0x0\n"
+ "sdot z18.s, z15.b, z3.b[2]\n"
+ "add z17.s, z16.s, z8.s\n"
+ "mov z16.s, #0x0\n"
+ "sdot z16.s, z15.b, z3.b[3]\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z14.s\n"
+ "mul z25.s, p2/M, z25.s, z14.s\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
+ "mul z26.s, p2/M, z26.s, z14.s\n"
+ "mul z27.s, p2/M, z27.s, z14.s\n"
+ "add z28.s, z23.s, z22.s\n"
+ "add z29.s, z21.s, z20.s\n"
+ "mul z28.s, p2/M, z28.s, z14.s\n"
+ "mul z29.s, p2/M, z29.s, z14.s\n"
+ "add z30.s, z19.s, z18.s\n"
+ "add z31.s, z17.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z14.s\n"
+ "mul z31.s, p2/M, z31.s, z14.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
"add z24.s, z24.s, z13.s\n"
- "add z23.s, z23.s, z14.s\n"
- "neg z30.s, p2/M, z30.s\n"
- "mul z31.s, p2/M, z31.s, z30.s\n"
- "st1w { z31.s }, p2, [SP]\n"
- "add z31.s, z31.s, z1.s\n"
- "mul z18.s, p2/M, z18.s, z30.s\n"
- "st1w { z18.s }, p2, [SP, #1, MUL VL]\n"
- "add z18.s, z18.s, z1.s\n"
- "mul z29.s, p2/M, z29.s, z30.s\n"
- "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
- "add z29.s, z29.s, z1.s\n"
- "mul z28.s, p2/M, z28.s, z30.s\n"
- "st1w { z28.s }, p2, [SP, #3, MUL VL]\n"
- "add z28.s, z28.s, z1.s\n"
- "mul z26.s, p2/M, z26.s, z30.s\n"
- "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
- "add z26.s, z26.s, z1.s\n"
- "mul z25.s, p2/M, z25.s, z30.s\n"
- "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z1.s\n"
- "mul z24.s, p2/M, z24.s, z30.s\n"
- "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
- "add z24.s, z24.s, z1.s\n"
- "mul z23.s, p2/M, z23.s, z30.s\n"
- "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
- "add z23.s, z23.s, z1.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"1:" // Loop
- "sdot z31.s, z8.b, z7.b[0]\n"
- "ld1w { z22.s }, p2/Z, [%x[params]]\n"
- "incb x28\n"
- "sdot z18.s, z8.b, z7.b[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "whilelt p0.s, x27, %x[n_channels]\n"
- "sdot z29.s, z8.b, z7.b[2]\n"
- "whilelt p1.b, x28, x9\n"
- "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "sdot z28.s, z8.b, z7.b[3]\n"
- "sdot z26.s, z8.b, z4.b[0]\n"
- "sdot z25.s, z8.b, z4.b[1]\n"
- "sdot z24.s, z8.b, z4.b[2]\n"
- "sdot z23.s, z8.b, z4.b[3]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "sdot z31.s, z9.b, z3.b[0]\n"
- "sdot z18.s, z9.b, z3.b[1]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "sdot z28.s, z9.b, z3.b[3]\n"
- "sdot z26.s, z9.b, z6.b[0]\n"
- "sdot z25.s, z9.b, z6.b[1]\n"
- "sdot z24.s, z9.b, z6.b[2]\n"
- "sdot z23.s, z9.b, z6.b[3]\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "sdot z31.s, z10.b, z4.b[0]\n"
- "sdot z18.s, z10.b, z4.b[1]\n"
- "sdot z29.s, z10.b, z4.b[2]\n"
- "sdot z28.s, z10.b, z4.b[3]\n"
- "sdot z26.s, z10.b, z5.b[0]\n"
- "sdot z25.s, z10.b, z5.b[1]\n"
- "sdot z24.s, z10.b, z5.b[2]\n"
- "sdot z23.s, z10.b, z5.b[3]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "sdot z24.s, z5.b, z0.b[0]\n"
+ "sdot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z26.s, z5.b, z0.b[2]\n"
+ "sdot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p1.b, x9, x10\n"
+ "ld1w { z13.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z26.s, z6.b, z1.b[2]\n"
+ "sdot z27.s, z6.b, z1.b[3]\n"
+ "sdot z28.s, z5.b, z2.b[0]\n"
+ "sdot z29.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z2.b[2]\n"
+ "sdot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04b57718 // sqrdmulh z24.s, z24.s, z21.s\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04b5775a // sqrdmulh z26.s, z26.s, z21.s\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04b5777b // sqrdmulh z27.s, z27.s, z21.s\n"
+ "ld1b { z6.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z28.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z20.d\n"
+ "sdot z30.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z20.d\n"
+ "ld1b { z7.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z20.d\n"
+ "and z16.d, z27.d, z20.d\n"
"addvl %x[params], %x[params], #6\n"
- ".inst 0x04b677ff // sqrdmulh z31.s, z31.s, z22.s\n"
- ".inst 0x04b67652 // sqrdmulh z18.s, z18.s, z22.s\n"
- ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
- ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
- ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
- "and z20.d, z31.d, z21.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z18.d, z21.d\n"
- "and z14.d, z29.d, z21.d\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z17.d, z28.d, z21.d\n"
- "and z2.d, z26.d, z21.d\n"
- "asr z14.s, z14.s, #0x1f\n"
- ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z31.s, z31.s, z20.s\n"
- ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
- "sqadd z18.s, z18.s, z19.s\n"
- "sqadd z29.s, z29.s, z14.s\n"
- "and z27.d, z25.d, z21.d\n"
- "asr z27.s, z27.s, #0x1f\n"
- "sqadd z28.s, z28.s, z17.s\n"
- "sqadd z26.s, z26.s, z2.s\n"
- "and z17.d, z24.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b5779c // sqrdmulh z28.s, z28.s, z21.s\n"
+ ".inst 0x04b577bd // sqrdmulh z29.s, z29.s, z21.s\n"
+ ".inst 0x04b577de // sqrdmulh z30.s, z30.s, z21.s\n"
+ ".inst 0x04b577ff // sqrdmulh z31.s, z31.s, z21.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
+ ".inst 0x44828a9b // srshl z27.s, p2/M, z27.s, z20.s\n"
+ "and z19.d, z28.d, z20.d\n"
+ "and z18.d, z29.d, z20.d\n"
+ "and z17.d, z30.d, z20.d\n"
+ "and z16.d, z31.d, z20.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "and z15.d, z23.d, z21.d\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "asr z15.s, z15.s, #0x1f\n"
- "sqadd z25.s, z25.s, z27.s\n"
- ".inst 0x44828ab2 // srshl z18.s, p2/M, z18.s, z21.s\n"
- "add z31.s, z31.s, z12.s\n"
- "sqadd z24.s, z24.s, z17.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
- "add z18.s, z18.s, z12.s\n"
- "sqadd z23.s, z23.s, z15.s\n"
- "smin z31.s, p2/M, z31.s, z0.s\n"
- "add z29.s, z29.s, z12.s\n"
- "smin z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- "smax z31.s, p2/M, z31.s, z16.s\n"
- "st1b { z31.s }, p0, [x26, x27]\n"
- "add z28.s, z28.s, z12.s\n"
- "smax z18.s, p2/M, z18.s, z16.s\n"
- "ld1w { z31.s }, p2/Z, [SP]\n"
- "smin z29.s, p2/M, z29.s, z0.s\n"
- "st1b { z18.s }, p0, [x25, x27]\n"
- "add z31.s, z31.s, z1.s\n"
- "smin z28.s, p2/M, z28.s, z0.s\n"
- "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n"
- "smax z29.s, p2/M, z29.s, z16.s\n"
- "st1b { z29.s }, p0, [x24, x27]\n"
- "add z18.s, z18.s, z1.s\n"
- "smax z28.s, p2/M, z28.s, z16.s\n"
- "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- "st1b { z28.s }, p0, [x23, x27]\n"
- "add z29.s, z29.s, z1.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
- "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n"
- "add z26.s, z26.s, z12.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
- "add z25.s, z25.s, z12.s\n"
- "add z28.s, z28.s, z1.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
+ ".inst 0x44828a9d // srshl z29.s, p2/M, z29.s, z20.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828a9e // srshl z30.s, p2/M, z30.s, z20.s\n"
+ ".inst 0x44828a9f // srshl z31.s, p2/M, z31.s, z20.s\n"
"add z24.s, z24.s, z12.s\n"
- "add z23.s, z23.s, z12.s\n"
- "smin z26.s, p2/M, z26.s, z0.s\n"
- "smin z25.s, p2/M, z25.s, z0.s\n"
- "smin z24.s, p2/M, z24.s, z0.s\n"
- "smin z23.s, p2/M, z23.s, z0.s\n"
- "smax z26.s, p2/M, z26.s, z16.s\n"
- "st1b { z26.s }, p0, [x22, x27]\n"
- "smax z25.s, p2/M, z25.s, z16.s\n"
- "smax z24.s, p2/M, z24.s, z16.s\n"
- "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
- "smax z23.s, p2/M, z23.s, z16.s\n"
- "st1b { z25.s }, p0, [x21, x27]\n"
- "add z26.s, z26.s, z1.s\n"
- "st1b { z24.s }, p0, [x20, x27]\n"
- "st1b { z23.s }, p0, [x19, x27]\n"
- "incw x27\n"
- "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z1.s\n"
- "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
- "add z24.s, z24.s, z1.s\n"
- "add z23.s, z23.s, z1.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "smin z25.s, p2/M, z25.s, z10.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "smin z26.s, p2/M, z26.s, z10.s\n"
+ "smin z27.s, p2/M, z27.s, z10.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "smin z28.s, p2/M, z28.s, z10.s\n"
+ "smin z29.s, p2/M, z29.s, z10.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "smin z30.s, p2/M, z30.s, z10.s\n"
+ "smin z31.s, p2/M, z31.s, z10.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smax z25.s, p2/M, z25.s, z11.s\n"
+ "st1b { z24.s }, p0, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z11.s\n"
+ "smax z27.s, p2/M, z27.s, z11.s\n"
+ "st1b { z25.s }, p0, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z11.s\n"
+ "smax z29.s, p2/M, z29.s, z11.s\n"
+ "st1b { z26.s }, p0, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z11.s\n"
+ "smax z31.s, p2/M, z31.s, z11.s\n"
+ "st1b { z27.s }, p0, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p0, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "st1b { z29.s }, p0, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z13.s\n"
+ "st1b { z30.s }, p0, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z13.s\n"
+ "st1b { z31.s }, p0, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 6bc5935348..2ed7cfc815 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,384 +41,358 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "mov z20.b, #0x1\n"
- "ldr x24, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "mov z22.s, #0x1\n"
- "ldr x23, [%x[inptrs], #0x8]\n"
- "lsl x9, %x[n_channels], #0x2\n"
- "mov z30.s, #0x0\n"
- "ldr x22, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "mov z28.s, #0x0\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
"mov x20, #0x6\n"
- "mov z29.s, #0x0\n"
- "ldr x19, [%x[inptrs], #0x20]\n"
"whilelt p0.b, XZR, x20\n"
- "mov z27.s, #0x0\n"
- "ld1b { z0.b }, p0/Z, [x24]\n"
- "mov x28, #0x0\n"
- "mov z26.s, #0x0\n"
- "ld1b { z3.b }, p0/Z, [x23]\n"
- "mov x27, #0x0\n"
- "mov z25.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22]\n"
- "whilelt p1.b, x28, x9\n"
- "mov z15.d, z0.d\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z20.d, z3.d\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
"ld1b { z4.b }, p0/Z, [x21]\n"
- "mov z24.s, #0x0\n"
- "ld1b { z6.b }, p0/Z, [x19]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
"ext z15.b, z15.b, z15.b, #0x1\n"
- "ldr x21, [%x[inptrs], #0x28]\n"
- "mov z16.d, z3.d\n"
- "ldr x20, [%x[inptrs], #0x30]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "ldr x19, [%x[inptrs], #0x38]\n"
- "mov z18.d, z5.d\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z20.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z20.d, z1.d\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z13.d, z5.d\n"
+ "mov z19.d, z6.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
- "zip1 z0.d, z0.d, z15.d\n"
- "ld1b { z1.b }, p0/Z, [x20]\n"
- "mov z0.q, z0.q[0]\n"
- "ld1b { z2.b }, p0/Z, [x19]\n"
- "zip1 z3.d, z3.d, z16.d\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z25.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
"mov z3.q, z3.q[0]\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "mov z16.d, z4.d\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ "mov z30.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p1.b, x9, x10\n"
+ "mov z24.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "sdot z24.s, z30.b, z3.b[0]\n"
+ "ld1w { z12.s }, p1/Z, [%x[params]]\n"
+ "mov z18.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "sdot z28.s, z30.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z16.d, z0.d\n"
+ "sdot z18.s, z30.b, z4.b[0]\n"
+ "sdot z17.s, z30.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
"ext z16.b, z16.b, z16.b, #0x1\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "mov z17.d, z6.d\n"
+ "zip1 z1.d, z1.d, z20.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z13.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
"ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z5.d, z5.d, z18.d\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "mov z5.q, z5.q[0]\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "zip1 z4.d, z4.d, z16.d\n"
- "ld1w { z13.s }, p1/Z, [%x[params]]\n"
- "mov z4.q, z4.q[0]\n"
+ "zip1 z6.d, z6.d, z19.d\n"
+ "zip1 z7.d, z7.d, z25.d\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z26.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "sdot z26.s, z30.b, z2.b[0]\n"
"ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z29.s, #0x1\n"
+ "sdot z22.s, z30.b, z2.b[2]\n"
+ "sdot z24.s, z29.b, z3.b[1]\n"
"ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "mov z16.d, z7.d\n"
+ "zip1 z0.d, z0.d, z16.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "sdot z28.s, z29.b, z3.b[3]\n"
"ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "addvl %x[params], %x[params], #5\n"
- "zip1 z6.d, z6.d, z17.d\n"
- "mov z17.d, z1.d\n"
+ "mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "zip1 z7.d, z7.d, z16.d\n"
+ "sdot z18.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
"mov z7.q, z7.q[0]\n"
- "ext z17.b, z17.b, z17.b, #0x1\n"
- "mov z16.d, z2.d\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "mov z23.s, #0x0\n"
- "zip1 z1.d, z1.d, z17.d\n"
- "mov z1.q, z1.q[0]\n"
- "zip1 z2.d, z2.d, z16.d\n"
- "mov z2.q, z2.q[0]\n"
- "mov z18.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0x0\n"
"mov z21.s, #0x0\n"
+ "sdot z17.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z20.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "sdot z21.s, z30.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
"mov z19.s, #0x0\n"
- "sdot z30.s, z20.b, z0.b[0]\n"
- "sdot z28.s, z20.b, z0.b[2]\n"
- "sdot z29.s, z20.b, z3.b[0]\n"
- "sdot z27.s, z20.b, z3.b[2]\n"
- "sdot z30.s, z22.b, z0.b[1]\n"
- "sdot z28.s, z22.b, z0.b[3]\n"
- "sdot z29.s, z22.b, z3.b[1]\n"
- "sdot z27.s, z22.b, z3.b[3]\n"
- "sdot z26.s, z20.b, z5.b[0]\n"
- "sdot z25.s, z20.b, z5.b[2]\n"
- "sdot z24.s, z20.b, z4.b[0]\n"
- "sdot z23.s, z20.b, z4.b[2]\n"
- "sdot z26.s, z22.b, z5.b[1]\n"
- "sdot z25.s, z22.b, z5.b[3]\n"
- "sdot z24.s, z22.b, z4.b[1]\n"
- "sdot z23.s, z22.b, z4.b[3]\n"
- "sdot z18.s, z20.b, z6.b[0]\n"
- "sdot z17.s, z20.b, z6.b[2]\n"
- "sdot z16.s, z20.b, z7.b[0]\n"
- "sdot z21.s, z20.b, z7.b[2]\n"
- "sdot z18.s, z22.b, z6.b[1]\n"
- "sdot z17.s, z22.b, z6.b[3]\n"
- "sdot z16.s, z22.b, z7.b[1]\n"
- "sdot z21.s, z22.b, z7.b[3]\n"
- "sdot z19.s, z20.b, z1.b[0]\n"
- "mov z30.d, z30.d\n"
- "mov z28.d, z28.d\n"
- "add z30.s, z30.s, z29.s\n"
- "sdot z19.s, z22.b, z1.b[1]\n"
- "add z28.s, z28.s, z27.s\n"
- "add z30.s, z30.s, z26.s\n"
- "mov z29.d, z29.d\n"
- "add z28.s, z28.s, z25.s\n"
- "add z30.s, z30.s, z24.s\n"
- "mov z27.d, z27.d\n"
- "add z28.s, z28.s, z23.s\n"
- "add z30.s, z30.s, z18.s\n"
- "add z29.s, z29.s, z26.s\n"
- "add z28.s, z28.s, z17.s\n"
- "add z27.s, z27.s, z25.s\n"
- "add z29.s, z29.s, z24.s\n"
- "mov z26.d, z26.d\n"
- "add z27.s, z27.s, z23.s\n"
- "add z29.s, z29.s, z18.s\n"
- "mov z25.d, z25.d\n"
- "add z27.s, z27.s, z17.s\n"
- "add z29.s, z29.s, z16.s\n"
- "add z26.s, z26.s, z24.s\n"
- "add z27.s, z27.s, z21.s\n"
- "add z25.s, z25.s, z23.s\n"
- "add z26.s, z26.s, z18.s\n"
- "mov z24.d, z24.d\n"
- "add z25.s, z25.s, z17.s\n"
- "add z26.s, z26.s, z16.s\n"
- "mov z23.d, z23.d\n"
- "add z25.s, z25.s, z21.s\n"
- "add z26.s, z26.s, z19.s\n"
+ "sdot z20.s, z30.b, z1.b[2]\n"
+ "sdot z25.s, z30.b, z5.b[0]\n"
+ "sdot z27.s, z30.b, z5.b[2]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z30.b, z6.b[0]\n"
+ "sdot z26.s, z29.b, z2.b[1]\n"
"add z24.s, z24.s, z18.s\n"
"mov z18.s, #0x0\n"
- "sdot z18.s, z20.b, z1.b[2]\n"
- "add z23.s, z23.s, z17.s\n"
- "mov z17.s, #0x0\n"
- "sdot z17.s, z20.b, z2.b[0]\n"
- "sdot z18.s, z22.b, z1.b[3]\n"
- "add z24.s, z24.s, z16.s\n"
+ "sdot z18.s, z30.b, z6.b[2]\n"
+ "sdot z22.s, z29.b, z2.b[3]\n"
+ "add z17.s, z28.s, z17.s\n"
+ "mov z16.s, #0x0\n"
+ "sdot z16.s, z30.b, z7.b[0]\n"
+ "sdot z21.s, z29.b, z1.b[1]\n"
+ "sdot z20.s, z29.b, z1.b[3]\n"
+ "add z28.s, z26.s, z24.s\n"
+ "sdot z25.s, z29.b, z5.b[1]\n"
+ "sdot z27.s, z29.b, z5.b[3]\n"
+ "add z31.s, z22.s, z17.s\n"
+ "sdot z19.s, z29.b, z6.b[1]\n"
+ "sdot z18.s, z29.b, z6.b[3]\n"
+ "add z22.s, z21.s, z28.s\n"
+ "sdot z16.s, z29.b, z7.b[1]\n"
+ "add z21.s, z20.s, z31.s\n"
+ "add z20.s, z25.s, z19.s\n"
+ "add z19.s, z27.s, z18.s\n"
+ "add z18.s, z16.s, z24.s\n"
+ "mov z16.s, #0x0\n"
+ "sdot z16.s, z30.b, z7.b[2]\n"
+ "sdot z16.s, z29.b, z7.b[3]\n"
+ "add z17.s, z16.s, z17.s\n"
"mov z16.s, #0x0\n"
- "sdot z17.s, z22.b, z2.b[1]\n"
- "sdot z16.s, z20.b, z2.b[2]\n"
- "add z25.s, z25.s, z18.s\n"
- "add z23.s, z23.s, z21.s\n"
- "add z24.s, z24.s, z19.s\n"
- "sdot z16.s, z22.b, z2.b[3]\n"
- "add z23.s, z23.s, z18.s\n"
- "add z24.s, z24.s, z17.s\n"
- "neg z15.s, p2/M, z15.s\n"
- "add z23.s, z23.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z15.s\n"
- "st1w { z30.s }, p2, [SP]\n"
- "add z30.s, z30.s, z13.s\n"
- "mul z28.s, p2/M, z28.s, z15.s\n"
- "st1w { z28.s }, p2, [SP, #1, MUL VL]\n"
- "add z28.s, z28.s, z13.s\n"
- "mul z29.s, p2/M, z29.s, z15.s\n"
- "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
- "add z29.s, z29.s, z13.s\n"
- "mul z27.s, p2/M, z27.s, z15.s\n"
- "st1w { z27.s }, p2, [SP, #3, MUL VL]\n"
- "add z27.s, z27.s, z13.s\n"
- "mul z26.s, p2/M, z26.s, z15.s\n"
- "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
- "add z26.s, z26.s, z13.s\n"
- "mul z25.s, p2/M, z25.s, z15.s\n"
- "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z13.s\n"
- "mul z24.s, p2/M, z24.s, z15.s\n"
- "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
- "add z24.s, z24.s, z13.s\n"
- "mul z23.s, p2/M, z23.s, z15.s\n"
- "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
- "add z23.s, z23.s, z13.s\n"
+ "sdot z16.s, z30.b, z0.b[0]\n"
+ "sdot z16.s, z29.b, z0.b[1]\n"
+ "add z24.s, z22.s, z16.s\n"
+ "add z26.s, z22.s, z25.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mov z16.s, #0x0\n"
+ "sdot z16.s, z30.b, z0.b[2]\n"
+ "sdot z16.s, z29.b, z0.b[3]\n"
+ "add z25.s, z21.s, z16.s\n"
+ "add z27.s, z21.s, z27.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z28.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z18.s, z20.s\n"
+ "add z31.s, z17.s, z19.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
"1:" // Loop
- "sdot z30.s, z8.b, z0.b[0]\n"
- "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "incb x28\n"
- "sdot z28.s, z8.b, z0.b[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "whilelt p0.s, x27, %x[n_channels]\n"
- "sdot z29.s, z8.b, z3.b[0]\n"
- "whilelt p1.b, x28, x9\n"
- "sdot z27.s, z8.b, z3.b[2]\n"
- "sdot z26.s, z8.b, z5.b[0]\n"
- "sdot z25.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z4.b[0]\n"
- "sdot z23.s, z8.b, z4.b[2]\n"
+ "sdot z24.s, z8.b, z0.b[0]\n"
+ "sdot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z26.s, z8.b, z1.b[0]\n"
+ "sdot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z9.b, z0.b[1]\n"
+ "sdot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p1.b, x9, x10\n"
+ "sdot z26.s, z9.b, z1.b[1]\n"
+ "sdot z27.s, z9.b, z1.b[3]\n"
+ "sdot z28.s, z8.b, z2.b[0]\n"
+ "sdot z29.s, z8.b, z2.b[2]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
"ld1b { z8.b }, p2/Z, [%x[params]]\n"
- "sdot z30.s, z9.b, z0.b[1]\n"
- "sdot z28.s, z9.b, z0.b[3]\n"
- "sdot z29.s, z9.b, z3.b[1]\n"
- "sdot z27.s, z9.b, z3.b[3]\n"
- "sdot z26.s, z9.b, z5.b[1]\n"
- "sdot z25.s, z9.b, z5.b[3]\n"
- "sdot z24.s, z9.b, z4.b[1]\n"
- "sdot z23.s, z9.b, z4.b[3]\n"
+ "sdot z24.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "sdot z28.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z2.b[3]\n"
+ "sdot z30.s, z9.b, z3.b[1]\n"
+ "sdot z31.s, z9.b, z3.b[3]\n"
"ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "sdot z28.s, z10.b, z3.b[2]\n"
- "sdot z29.s, z10.b, z5.b[0]\n"
- "sdot z27.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z4.b[0]\n"
- "sdot z25.s, z10.b, z4.b[2]\n"
- "sdot z24.s, z10.b, z6.b[0]\n"
- "sdot z23.s, z10.b, z6.b[2]\n"
+ "sdot z24.s, z11.b, z1.b[1]\n"
+ "sdot z25.s, z11.b, z1.b[3]\n"
+ "sdot z26.s, z11.b, z2.b[1]\n"
+ "sdot z27.s, z11.b, z2.b[3]\n"
+ "sdot z28.s, z10.b, z3.b[0]\n"
+ "sdot z29.s, z10.b, z3.b[2]\n"
+ "sdot z30.s, z10.b, z4.b[0]\n"
+ "sdot z31.s, z10.b, z4.b[2]\n"
"ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sdot z30.s, z11.b, z3.b[1]\n"
- "sdot z28.s, z11.b, z3.b[3]\n"
- "sdot z29.s, z11.b, z5.b[1]\n"
- "sdot z27.s, z11.b, z5.b[3]\n"
- "sdot z26.s, z11.b, z4.b[1]\n"
- "sdot z25.s, z11.b, z4.b[3]\n"
- "sdot z24.s, z11.b, z6.b[1]\n"
- "sdot z23.s, z11.b, z6.b[3]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z25.s, z8.b, z2.b[2]\n"
+ "sdot z26.s, z8.b, z3.b[0]\n"
+ "sdot z27.s, z8.b, z3.b[2]\n"
+ "sdot z28.s, z11.b, z3.b[1]\n"
+ "sdot z29.s, z11.b, z3.b[3]\n"
+ "sdot z30.s, z11.b, z4.b[1]\n"
+ "sdot z31.s, z11.b, z4.b[3]\n"
"ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z9.b, z2.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[3]\n"
+ "sdot z26.s, z9.b, z3.b[1]\n"
+ "sdot z27.s, z9.b, z3.b[3]\n"
+ "sdot z28.s, z8.b, z4.b[0]\n"
+ "sdot z29.s, z8.b, z4.b[2]\n"
"sdot z30.s, z8.b, z5.b[0]\n"
- "sdot z28.s, z8.b, z5.b[2]\n"
- "sdot z29.s, z8.b, z4.b[0]\n"
- "sdot z27.s, z8.b, z4.b[2]\n"
- "sdot z26.s, z8.b, z6.b[0]\n"
- "sdot z25.s, z8.b, z6.b[2]\n"
- "sdot z24.s, z8.b, z7.b[0]\n"
- "sdot z23.s, z8.b, z7.b[2]\n"
+ "sdot z31.s, z8.b, z5.b[2]\n"
"ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z10.b, z3.b[0]\n"
+ "sdot z25.s, z10.b, z3.b[2]\n"
+ "sdot z26.s, z10.b, z4.b[0]\n"
+ "sdot z27.s, z10.b, z4.b[2]\n"
+ "sdot z28.s, z9.b, z4.b[1]\n"
+ "sdot z29.s, z9.b, z4.b[3]\n"
"sdot z30.s, z9.b, z5.b[1]\n"
- "sdot z28.s, z9.b, z5.b[3]\n"
- "sdot z29.s, z9.b, z4.b[1]\n"
- "sdot z27.s, z9.b, z4.b[3]\n"
- "sdot z26.s, z9.b, z6.b[1]\n"
- "sdot z25.s, z9.b, z6.b[3]\n"
- "sdot z24.s, z9.b, z7.b[1]\n"
- "sdot z23.s, z9.b, z7.b[3]\n"
+ "sdot z31.s, z9.b, z5.b[3]\n"
"ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z30.s, z10.b, z4.b[0]\n"
- "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "sdot z28.s, z10.b, z4.b[2]\n"
- "sdot z29.s, z10.b, z6.b[0]\n"
- "sdot z27.s, z10.b, z6.b[2]\n"
- "sdot z26.s, z10.b, z7.b[0]\n"
- "sdot z25.s, z10.b, z7.b[2]\n"
- "sdot z24.s, z10.b, z1.b[0]\n"
- "sdot z23.s, z10.b, z1.b[2]\n"
+ "sdot z24.s, z11.b, z3.b[1]\n"
+ "sdot z25.s, z11.b, z3.b[3]\n"
+ "ld1w { z12.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z26.s, z11.b, z4.b[1]\n"
+ "sdot z27.s, z11.b, z4.b[3]\n"
+ "sdot z28.s, z10.b, z5.b[0]\n"
+ "sdot z29.s, z10.b, z5.b[2]\n"
+ "sdot z30.s, z10.b, z6.b[0]\n"
+ "sdot z31.s, z10.b, z6.b[2]\n"
"ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z30.s, z11.b, z4.b[1]\n"
- "sdot z28.s, z11.b, z4.b[3]\n"
- "sdot z29.s, z11.b, z6.b[1]\n"
- "sdot z27.s, z11.b, z6.b[3]\n"
- "sdot z26.s, z11.b, z7.b[1]\n"
- "sdot z25.s, z11.b, z7.b[3]\n"
- "sdot z24.s, z11.b, z1.b[1]\n"
- "sdot z23.s, z11.b, z1.b[3]\n"
+ "sdot z24.s, z8.b, z4.b[0]\n"
+ "sdot z25.s, z8.b, z4.b[2]\n"
+ "sdot z26.s, z8.b, z5.b[0]\n"
+ "sdot z27.s, z8.b, z5.b[2]\n"
+ "sdot z28.s, z11.b, z5.b[1]\n"
+ "sdot z29.s, z11.b, z5.b[3]\n"
+ "sdot z30.s, z11.b, z6.b[1]\n"
+ "sdot z31.s, z11.b, z6.b[3]\n"
"ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "sdot z30.s, z8.b, z6.b[0]\n"
- "sdot z28.s, z8.b, z6.b[2]\n"
- "sdot z29.s, z8.b, z7.b[0]\n"
- "sdot z27.s, z8.b, z7.b[2]\n"
- "sdot z26.s, z8.b, z1.b[0]\n"
- "sdot z25.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z23.s, z8.b, z2.b[2]\n"
+ "sdot z24.s, z9.b, z4.b[1]\n"
+ "sdot z25.s, z9.b, z4.b[3]\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ "sdot z26.s, z9.b, z5.b[1]\n"
+ "sdot z27.s, z9.b, z5.b[3]\n"
+ ".inst 0x04b17739 // sqrdmulh z25.s, z25.s, z17.s\n"
+ "sdot z28.s, z8.b, z6.b[0]\n"
+ "sdot z29.s, z8.b, z6.b[2]\n"
+ ".inst 0x04b1775a // sqrdmulh z26.s, z26.s, z17.s\n"
+ "sdot z30.s, z8.b, z7.b[0]\n"
+ "sdot z31.s, z8.b, z7.b[2]\n"
+ ".inst 0x04b1777b // sqrdmulh z27.s, z27.s, z17.s\n"
"ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "sdot z30.s, z9.b, z6.b[1]\n"
- "sdot z28.s, z9.b, z6.b[3]\n"
- "sdot z29.s, z9.b, z7.b[1]\n"
- "sdot z27.s, z9.b, z7.b[3]\n"
- "sdot z26.s, z9.b, z1.b[1]\n"
- "sdot z25.s, z9.b, z1.b[3]\n"
- "sdot z24.s, z9.b, z2.b[1]\n"
- "sdot z23.s, z9.b, z2.b[3]\n"
+ "sdot z28.s, z9.b, z6.b[1]\n"
+ "sdot z29.s, z9.b, z6.b[3]\n"
+ "and z16.d, z24.d, z19.d\n"
+ "sdot z30.s, z9.b, z7.b[1]\n"
+ "sdot z31.s, z9.b, z7.b[3]\n"
+ "and z18.d, z25.d, z19.d\n"
"ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "addvl %x[params], %x[params], #-3\n"
- ".inst 0x04b677de // sqrdmulh z30.s, z30.s, z22.s\n"
- ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
- ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
- ".inst 0x04b6777b // sqrdmulh z27.s, z27.s, z22.s\n"
- ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
- "and z20.d, z30.d, z21.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z28.d, z21.d\n"
- "and z18.d, z29.d, z21.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "and z17.d, z27.d, z21.d\n"
- "and z16.d, z26.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "addvl %x[params], %x[params], #-3\n"
+ ".inst 0x04b1779c // sqrdmulh z28.s, z28.s, z17.s\n"
+ ".inst 0x04b177bd // sqrdmulh z29.s, z29.s, z17.s\n"
+ ".inst 0x04b177de // sqrdmulh z30.s, z30.s, z17.s\n"
+ ".inst 0x04b177ff // sqrdmulh z31.s, z31.s, z17.s\n"
+ "and z17.d, z26.d, z19.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z30.s, z30.s, z20.s\n"
- ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ "and z16.d, z27.d, z19.d\n"
+ ".inst 0x44828a78 // srshl z24.s, p2/M, z24.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828a79 // srshl z25.s, p2/M, z25.s, z19.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828a7a // srshl z26.s, p2/M, z26.s, z19.s\n"
+ ".inst 0x44828a7b // srshl z27.s, p2/M, z27.s, z19.s\n"
+ "and z16.d, z28.d, z19.d\n"
+ "and z18.d, z29.d, z19.d\n"
+ "and z17.d, z30.d, z19.d\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
- "sqadd z28.s, z28.s, z19.s\n"
- "sqadd z29.s, z29.s, z18.s\n"
- "and z18.d, z25.d, z21.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z27.s, z27.s, z17.s\n"
- "sqadd z26.s, z26.s, z16.s\n"
- "and z17.d, z24.d, z21.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "and z16.d, z23.d, z21.d\n"
- ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ "sqadd z28.s, z28.s, z16.s\n"
+ "and z16.d, z31.d, z19.d\n"
+ ".inst 0x44828a7c // srshl z28.s, p2/M, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- "add z30.s, z30.s, z14.s\n"
- "sqadd z24.s, z24.s, z17.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828a7d // srshl z29.s, p2/M, z29.s, z19.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828a7e // srshl z30.s, p2/M, z30.s, z19.s\n"
+ ".inst 0x44828a7f // srshl z31.s, p2/M, z31.s, z19.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
"add z28.s, z28.s, z14.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- "smin z30.s, p2/M, z30.s, z12.s\n"
"add z29.s, z29.s, z14.s\n"
- "smin z28.s, p2/M, z28.s, z12.s\n"
- ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
- "smax z30.s, p2/M, z30.s, z31.s\n"
- "st1b { z30.s }, p0, [x26, x27]\n"
- "add z27.s, z27.s, z14.s\n"
- "smax z28.s, p2/M, z28.s, z31.s\n"
- "ld1w { z30.s }, p2/Z, [SP]\n"
- "smin z29.s, p2/M, z29.s, z12.s\n"
- "st1b { z28.s }, p0, [x25, x27]\n"
- "add z30.s, z30.s, z13.s\n"
- "smin z27.s, p2/M, z27.s, z12.s\n"
- "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n"
- "smax z29.s, p2/M, z29.s, z31.s\n"
- "st1b { z29.s }, p0, [x24, x27]\n"
- "add z28.s, z28.s, z13.s\n"
- "smax z27.s, p2/M, z27.s, z31.s\n"
- "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- "st1b { z27.s }, p0, [x23, x27]\n"
- "add z29.s, z29.s, z13.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
- "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n"
- "add z26.s, z26.s, z14.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
- "add z25.s, z25.s, z14.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z23.s, z23.s, z14.s\n"
- "smin z26.s, p2/M, z26.s, z12.s\n"
- "smin z25.s, p2/M, z25.s, z12.s\n"
- "smin z24.s, p2/M, z24.s, z12.s\n"
- "smin z23.s, p2/M, z23.s, z12.s\n"
- "smax z26.s, p2/M, z26.s, z31.s\n"
- "st1b { z26.s }, p0, [x22, x27]\n"
- "smax z25.s, p2/M, z25.s, z31.s\n"
- "smax z24.s, p2/M, z24.s, z31.s\n"
- "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
- "smax z23.s, p2/M, z23.s, z31.s\n"
- "st1b { z25.s }, p0, [x21, x27]\n"
- "add z26.s, z26.s, z13.s\n"
- "st1b { z24.s }, p0, [x20, x27]\n"
- "st1b { z23.s }, p0, [x19, x27]\n"
- "incw x27\n"
- "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z13.s\n"
- "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
- "add z24.s, z24.s, z13.s\n"
- "add z23.s, z23.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z13.s\n"
+ "smax z25.s, p2/M, z25.s, z13.s\n"
+ "st1b { z24.s }, p0, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z13.s\n"
+ "smax z27.s, p2/M, z27.s, z13.s\n"
+ "st1b { z25.s }, p0, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z13.s\n"
+ "smax z29.s, p2/M, z29.s, z13.s\n"
+ "st1b { z26.s }, p0, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z13.s\n"
+ "smax z31.s, p2/M, z31.s, z13.s\n"
+ "st1b { z27.s }, p0, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p0, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z12.s\n"
+ "st1b { z29.s }, p0, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z12.s\n"
+ "st1b { z30.s }, p0, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z12.s\n"
+ "st1b { z31.s }, p0, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 391e98b561..6a432e1961 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,352 +41,400 @@ void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ldp x11, x10, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "ldp x9, x28, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "ldp x27, x26, [%x[inptrs], #0x20]\n"
- "mov x25, #0x0\n"
- "ldp x24, x23, [%x[inptrs], #0x30]\n"
- "whilelt p1.b, x25, %x[n_channels]\n"
- "ldp x22, x21, [%x[outptrs], #0x0]\n"
- "ldp x20, x19, [%x[outptrs], #0x10]\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "mov x13, #0x0\n"
+ "whilelt p2.b, x13, %x[n_channels]\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "ptrue p1.b\n"
+ "mov x24, #0x0\n"
+ "ldp x23, x22, [%x[outptrs], #0x0]\n"
+ "ldp x21, x20, [%x[outptrs], #0x10]\n"
+ "ld1b { z9.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z8.b }, p2/Z, [x11, x13]\n"
+ "ldp x12, x11, [%x[inptrs], #0x40]\n"
+ "ld1b { z7.b }, p2/Z, [x10, x13]\n"
+ "zip2 z6.b, z9.b, z7.b\n"
+ "zip1 z9.b, z9.b, z7.b\n"
+ "ld1b { z5.b }, p2/Z, [x9, x13]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "zip1 z7.b, z8.b, z5.b\n"
+ "zip2 z5.b, z8.b, z5.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z3.b }, p2/Z, [x27, x13]\n"
+ "zip2 z8.b, z9.b, z7.b\n"
+ "zip1 z9.b, z9.b, z7.b\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ld1b { z2.b }, p2/Z, [x26, x13]\n"
+ "zip1 z7.b, z6.b, z5.b\n"
+ "zip2 z5.b, z6.b, z5.b\n"
+ "ld1b { z1.b }, p2/Z, [x25, x13]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "zip2 z0.b, z4.b, z2.b\n"
+ "zip1 z4.b, z4.b, z2.b\n"
+ "ld1b { z31.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z30.b }, p2/Z, [x11, x13]\n"
+ "zip1 z2.b, z3.b, z1.b\n"
+ "zip2 z1.b, z3.b, z1.b\n"
+ "ld1b { z29.b }, p2/Z, [x10, x13]\n"
+ "ld1b { z28.b }, p2/Z, [x9, x13]\n"
+ "zip2 z27.b, z31.b, z29.b\n"
+ "zip1 z31.b, z31.b, z29.b\n"
+ "ld1b { z26.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z25.b }, p2/Z, [x27, x13]\n"
+ "zip1 z29.b, z30.b, z28.b\n"
+ "zip2 z28.b, z30.b, z28.b\n"
+ "ld1b { z24.b }, p2/Z, [x26, x13]\n"
+ "ld1b { z23.b }, p2/Z, [x25, x13]\n"
+ "zip2 z22.b, z26.b, z24.b\n"
+ "zip1 z26.b, z26.b, z24.b\n"
+ "zip1 z24.b, z25.b, z23.b\n"
+ "zip2 z23.b, z25.b, z23.b\n"
+ "ld1w { z6.s }, p1/Z, [%x[params]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z19.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip2 z3.b, z4.b, z2.b\n"
+ "zip1 z4.b, z4.b, z2.b\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "zip1 z2.b, z0.b, z1.b\n"
+ "zip2 z1.b, z0.b, z1.b\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip2 z30.b, z31.b, z29.b\n"
+ "zip1 z31.b, z31.b, z29.b\n"
+ "zip1 z29.b, z27.b, z28.b\n"
+ "zip2 z28.b, z27.b, z28.b\n"
+ "ld1b { z18.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z25.b, z26.b, z24.b\n"
+ "zip1 z26.b, z26.b, z24.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "zip1 z24.b, z22.b, z23.b\n"
+ "zip2 z23.b, z22.b, z23.b\n"
+ "mov z0.d, z6.d\n"
+ "mov z27.d, z6.d\n"
+ "mov z22.d, z6.d\n"
"1:" // Loop
- "ld1b { z19.b }, p1/Z, [x11, x25]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
- "ld1b { z18.b }, p1/Z, [x10, x25]\n"
- "ldp x11, x10, [%x[inptrs], #0x40]\n"
- "ld1b { z16.b }, p1/Z, [x9, x25]\n"
- "zip1 z21.b, z19.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x28, x25]\n"
- "zip2 z19.b, z19.b, z16.b\n"
- "ldp x9, x28, [%x[inptrs], #0x50]\n"
- "ld1b { z23.b }, p1/Z, [x27, x25]\n"
- "zip1 z16.b, z18.b, z17.b\n"
- "ld1b { z20.b }, p1/Z, [x26, x25]\n"
- "zip2 z18.b, z18.b, z17.b\n"
- "ldp x27, x26, [%x[inptrs], #0x60]\n"
- "zip1 z3.b, z21.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x24, x25]\n"
- "zip2 z2.b, z21.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x23, x25]\n"
- "zip1 z29.b, z19.b, z18.b\n"
- "ldp x24, x23, [%x[inptrs], #0x70]\n"
- "zip2 z28.b, z19.b, z18.b\n"
- "ld1b { z22.b }, p1/Z, [x11, x25]\n"
- "zip1 z19.b, z23.b, z17.b\n"
- "ld1b { z21.b }, p1/Z, [x10, x25]\n"
- "zip2 z27.b, z23.b, z17.b\n"
- "ldp x11, x10, [%x[inptrs], #0x0]\n"
- "zip1 z18.b, z20.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x9, x25]\n"
- "zip2 z20.b, z20.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x28, x25]\n"
- "zip1 z1.b, z19.b, z18.b\n"
- "ldp x9, x28, [%x[inptrs], #0x10]\n"
- "zip2 z0.b, z19.b, z18.b\n"
- "ld1b { z19.b }, p1/Z, [x27, x25]\n"
- "zip1 z26.b, z22.b, z17.b\n"
- "ld1b { z25.b }, p1/Z, [x26, x25]\n"
- "zip2 z24.b, z22.b, z17.b\n"
- "ldp x27, x26, [%x[inptrs], #0x20]\n"
- "zip1 z23.b, z21.b, z16.b\n"
- "ld1b { z18.b }, p1/Z, [x24, x25]\n"
- "zip2 z22.b, z21.b, z16.b\n"
- "ld1b { z21.b }, p1/Z, [x23, x25]\n"
- "zip1 z17.b, z27.b, z20.b\n"
- "ldp x24, x23, [%x[inptrs], #0x30]\n"
- "zip2 z16.b, z27.b, z20.b\n"
- "st1b { z29.b }, p2, [SP]\n"
- "zip1 z20.b, z19.b, z18.b\n"
- "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
- "zip2 z19.b, z19.b, z18.b\n"
- "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
- "zip1 z18.b, z25.b, z21.b\n"
- "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
- "zip2 z17.b, z25.b, z21.b\n"
- "ld1w { z31.s }, p2/Z, [%x[params]]\n"
- "zip1 z30.b, z26.b, z23.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z28.b, z26.b, z23.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z16.b, z24.b, z22.b\n"
- "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
- "zip2 z16.b, z24.b, z22.b\n"
- "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
- "zip1 z26.b, z20.b, z18.b\n"
- "ld1b { z25.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "zip2 z24.b, z20.b, z18.b\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "zip1 z16.b, z19.b, z17.b\n"
- "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
- "zip2 z16.b, z19.b, z17.b\n"
- "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
- "mov z22.d, z31.d\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "mov z20.d, z31.d\n"
- "mov z19.d, z31.d\n"
- "sdot z31.s, z29.b, z3.b\n"
- "sdot z20.s, z29.b, z1.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "sdot z31.s, z27.b, z1.b\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z20.s, z27.b, z30.b\n"
- "sdot z22.s, z29.b, z3.b\n"
- "ld1b { z3.b }, p2/Z, [SP]\n"
- "sdot z31.s, z25.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z20.s, z25.b, z26.b\n"
+ "sdot z6.s, z18.b, z9.b\n"
+ "sdot z27.s, z18.b, z4.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "whilelt p0.s, x24, %x[n_channels]\n"
+ "sdot z6.s, z17.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z0.s, z18.b, z9.b\n"
+ "ld1w { z9.s }, p1/Z, [%x[params]]\n"
+ "sdot z22.s, z18.b, z4.b\n"
+ "sdot z27.s, z17.b, z31.b\n"
+ "incw x13, ALL, MUL #4\n"
+ "sdot z6.s, z16.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z0.s, z17.b, z4.b\n"
+ "ld1w { z4.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z22.s, z17.b, z31.b\n"
+ "sdot z27.s, z16.b, z26.b\n"
"ext z26.b, z26.b, z26.b, #0x1\n"
- "sdot z19.s, z29.b, z1.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #7, MUL VL]\n"
- "sdot z22.s, z27.b, z1.b\n"
- "ld1b { z1.b }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
- "sdot z19.s, z27.b, z30.b\n"
- "sdot z22.s, z25.b, z30.b\n"
- "ld1b { z30.b }, p2/Z, [SP, #4, MUL VL]\n"
- "and z16.d, z31.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sdot z19.s, z25.b, z26.b\n"
- "ld1b { z26.b }, p2/Z, [SP, #6, MUL VL]\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- "and z18.d, z20.d, z21.d\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sdot z0.s, z16.b, z31.b\n"
+ "sdot z22.s, z16.b, z26.b\n"
+ "and z18.d, z6.d, z4.d\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
- "sqadd z31.s, z31.s, z16.s\n"
- "and z17.d, z22.d, z21.d\n"
+ ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
+ ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
+ ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
+ "ld1w { z9.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "and z17.d, z0.d, z4.d\n"
+ "and z16.d, z27.d, z4.d\n"
+ "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "and z16.d, z19.d, z21.d\n"
- "sqadd z20.s, z20.s, z18.s\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
- "add z31.s, z31.s, z4.s\n"
- "sqadd z19.s, z19.s, z16.s\n"
- "add z20.s, z20.s, z4.s\n"
- ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
- "smax z31.s, p2/M, z31.s, z6.s\n"
- "smax z20.s, p2/M, z20.s, z6.s\n"
- ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
- "add z22.s, z22.s, z4.s\n"
- "smin z31.s, p2/M, z31.s, z5.s\n"
- "st1b { z31.s }, p0, [x22, x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "smax z22.s, p2/M, z22.s, z6.s\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "addvl %x[params], %x[params], #16\n"
- "smin z20.s, p2/M, z20.s, z5.s\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z25.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "smax z19.s, p2/M, z19.s, z6.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
- "smin z22.s, p2/M, z22.s, z5.s\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "smin z19.s, p2/M, z19.s, z5.s\n"
- "st1b { z20.s }, p0, [x20, x25]\n"
- "mov z20.d, z31.d\n"
- "st1b { z22.s }, p0, [x21, x25]\n"
- "mov z22.d, z31.d\n"
- "st1b { z19.s }, p0, [x19, x25]\n"
- "mov z19.d, z31.d\n"
- "incw x25\n"
- "sdot z31.s, z29.b, z2.b\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
- "sdot z20.s, z29.b, z0.b\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z31.s, z27.b, z0.b\n"
- "sdot z20.s, z27.b, z28.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "sdot z22.s, z29.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [SP, #1, MUL VL]\n"
- "sdot z31.s, z25.b, z28.b\n"
- "sdot z20.s, z25.b, z24.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "ext z24.b, z24.b, z24.b, #0x1\n"
- "sdot z19.s, z29.b, z0.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z22.s, z27.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [SP, #3, MUL VL]\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
- "sdot z19.s, z27.b, z28.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "sdot z22.s, z25.b, z28.b\n"
- "ld1b { z28.b }, p2/Z, [SP, #5, MUL VL]\n"
- "and z16.d, z31.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sdot z19.s, z25.b, z24.b\n"
- "ld1b { z25.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- "ld1b { z24.b }, p2/Z, [SP, #7, MUL VL]\n"
- "and z18.d, z20.d, z21.d\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params]]\n"
- "sqadd z31.s, z31.s, z16.s\n"
- "and z17.d, z22.d, z21.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "and z16.d, z19.d, z21.d\n"
- "sqadd z20.s, z20.s, z18.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
- "add z31.s, z31.s, z4.s\n"
- "sqadd z19.s, z19.s, z16.s\n"
- "add z20.s, z20.s, z4.s\n"
- ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
- "smax z31.s, p2/M, z31.s, z6.s\n"
- "smax z20.s, p2/M, z20.s, z6.s\n"
- ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "add z22.s, z22.s, z4.s\n"
- "smin z31.s, p2/M, z31.s, z5.s\n"
- "st1b { z31.s }, p0, [x22, x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "smax z22.s, p2/M, z22.s, z6.s\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "smin z20.s, p2/M, z20.s, z5.s\n"
- "st1b { z20.s }, p0, [x20, x25]\n"
- "mov z20.d, z31.d\n"
- "smin z22.s, p2/M, z22.s, z5.s\n"
- "st1b { z22.s }, p0, [x21, x25]\n"
- "mov z22.d, z31.d\n"
- "sdot z20.s, z29.b, z1.b\n"
- "smax z19.s, p2/M, z19.s, z6.s\n"
- "sdot z20.s, z27.b, z30.b\n"
- "smin z19.s, p2/M, z19.s, z5.s\n"
- "st1b { z19.s }, p0, [x19, x25]\n"
- "mov z19.d, z31.d\n"
- "incw x25\n"
- "sdot z31.s, z29.b, z3.b\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
- "sdot z20.s, z25.b, z26.b\n"
+ "sqadd z0.s, z0.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
+ ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ "add z6.s, z6.s, z19.s\n"
+ ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
+ "smax z6.s, p1/M, z6.s, z21.s\n"
+ "add z0.s, z0.s, z19.s\n"
+ "add z27.s, z27.s, z19.s\n"
+ "smin z6.s, p1/M, z6.s, z20.s\n"
+ "smax z0.s, p1/M, z0.s, z21.s\n"
+ "add z22.s, z22.s, z19.s\n"
+ "smax z27.s, p1/M, z27.s, z21.s\n"
+ "smax z22.s, p1/M, z22.s, z21.s\n"
+ "st1b { z6.s }, p0, [x23, x24]\n"
+ "ld1w { z6.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "smin z0.s, p1/M, z0.s, z20.s\n"
+ "smin z27.s, p1/M, z27.s, z20.s\n"
+ "smin z22.s, p1/M, z22.s, z20.s\n"
+ "st1b { z0.s }, p0, [x22, x24]\n"
+ "mov z0.d, z6.d\n"
+ "ld1b { z17.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z27.s }, p0, [x21, x24]\n"
+ "mov z27.d, z6.d\n"
+ "sdot z27.s, z18.b, z3.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z22.s }, p0, [x20, x24]\n"
+ "mov z22.d, z6.d\n"
+ "sdot z6.s, z18.b, z8.b\n"
+ "sdot z6.s, z17.b, z3.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
"ext z3.b, z3.b, z3.b, #0x1\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- "sdot z31.s, z27.b, z1.b\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z22.s, z29.b, z3.b\n"
- ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
- "sdot z31.s, z25.b, z30.b\n"
+ "sdot z0.s, z18.b, z8.b\n"
+ "ld1w { z4.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z22.s, z18.b, z3.b\n"
+ "sdot z27.s, z17.b, z30.b\n"
+ "incw x24\n"
+ "whilelt p0.s, x24, %x[n_channels]\n"
+ "sdot z6.s, z16.b, z30.b\n"
"ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z19.s, z29.b, z1.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z22.s, z27.b, z1.b\n"
- "and z18.d, z20.d, z21.d\n"
+ "sdot z0.s, z17.b, z3.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z22.s, z17.b, z30.b\n"
+ "sdot z27.s, z16.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sdot z0.s, z16.b, z30.b\n"
+ "sdot z22.s, z16.b, z25.b\n"
+ "and z18.d, z6.d, z4.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sdot z19.s, z27.b, z30.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "sdot z22.s, z25.b, z30.b\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- "sdot z19.s, z25.b, z26.b\n"
- "ld1b { z25.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "and z16.d, z31.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- "sqadd z20.s, z20.s, z18.s\n"
- ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "and z17.d, z22.d, z21.d\n"
+ ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
+ ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
+ ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
+ "ld1w { z9.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "and z17.d, z0.d, z4.d\n"
+ "and z16.d, z27.d, z4.d\n"
+ "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z31.s, z31.s, z16.s\n"
- "and z16.d, z19.d, z21.d\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "add z20.s, z20.s, z4.s\n"
- "add z31.s, z31.s, z4.s\n"
- "sqadd z19.s, z19.s, z16.s\n"
- ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
- "smax z20.s, p2/M, z20.s, z6.s\n"
- "smax z31.s, p2/M, z31.s, z6.s\n"
- ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "add z22.s, z22.s, z4.s\n"
- "smin z20.s, p2/M, z20.s, z5.s\n"
- "st1b { z20.s }, p0, [x20, x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "smin z31.s, p2/M, z31.s, z5.s\n"
- "st1b { z31.s }, p0, [x22, x25]\n"
- "smax z22.s, p2/M, z22.s, z6.s\n"
- "smax z19.s, p2/M, z19.s, z6.s\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "addvl %x[params], %x[params], #8\n"
- "mov z20.d, z31.d\n"
- "smin z22.s, p2/M, z22.s, z5.s\n"
- "st1b { z22.s }, p0, [x21, x25]\n"
- "mov z22.d, z31.d\n"
- "sdot z20.s, z29.b, z0.b\n"
- "smin z19.s, p2/M, z19.s, z5.s\n"
- "st1b { z19.s }, p0, [x19, x25]\n"
- "mov z19.d, z31.d\n"
- "incw x25\n"
- "sdot z31.s, z29.b, z2.b\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
- "sdot z20.s, z27.b, z28.b\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
+ ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ "add z6.s, z6.s, z19.s\n"
+ ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
+ "smax z6.s, p1/M, z6.s, z21.s\n"
+ "add z0.s, z0.s, z19.s\n"
+ "add z27.s, z27.s, z19.s\n"
+ "smin z6.s, p1/M, z6.s, z20.s\n"
+ "smax z0.s, p1/M, z0.s, z21.s\n"
+ "add z22.s, z22.s, z19.s\n"
+ "smax z27.s, p1/M, z27.s, z21.s\n"
+ "smax z22.s, p1/M, z22.s, z21.s\n"
+ "st1b { z6.s }, p0, [x23, x24]\n"
+ "ld1w { z6.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z0.s, p1/M, z0.s, z20.s\n"
+ "smin z27.s, p1/M, z27.s, z20.s\n"
+ "smin z22.s, p1/M, z22.s, z20.s\n"
+ "st1b { z0.s }, p0, [x22, x24]\n"
+ "mov z0.d, z6.d\n"
+ "ld1b { z17.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "st1b { z27.s }, p0, [x21, x24]\n"
+ "mov z27.d, z6.d\n"
+ "sdot z27.s, z18.b, z2.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "st1b { z22.s }, p0, [x20, x24]\n"
+ "mov z22.d, z6.d\n"
+ "sdot z6.s, z18.b, z7.b\n"
+ "sdot z6.s, z17.b, z2.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z31.s, z27.b, z0.b\n"
- "sdot z20.s, z25.b, z24.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z0.s, z18.b, z7.b\n"
+ "ld1w { z4.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z22.s, z18.b, z2.b\n"
+ "sdot z27.s, z17.b, z29.b\n"
+ "incw x24\n"
+ "whilelt p0.s, x24, %x[n_channels]\n"
+ "sdot z6.s, z16.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "sdot z0.s, z17.b, z2.b\n"
+ "sdot z22.s, z17.b, z29.b\n"
+ "sdot z27.s, z16.b, z24.b\n"
"ext z24.b, z24.b, z24.b, #0x1\n"
- "sdot z22.s, z29.b, z2.b\n"
- "sdot z31.s, z25.b, z28.b\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "sdot z19.s, z29.b, z0.b\n"
- "sdot z22.s, z27.b, z0.b\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
- "sdot z19.s, z27.b, z28.b\n"
- "sdot z22.s, z25.b, z28.b\n"
- "and z16.d, z31.d, z21.d\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sdot z0.s, z16.b, z29.b\n"
+ "sdot z22.s, z16.b, z24.b\n"
+ "and z18.d, z6.d, z4.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
+ ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
+ ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
+ "ld1w { z9.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "and z17.d, z0.d, z4.d\n"
+ "and z16.d, z27.d, z4.d\n"
+ "and z18.d, z22.d, z4.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "sdot z19.s, z25.b, z24.b\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- "and z18.d, z20.d, z21.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "and z17.d, z22.d, z21.d\n"
- ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ "sqadd z0.s, z0.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
+ ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ "add z6.s, z6.s, z19.s\n"
+ ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
+ "smax z6.s, p1/M, z6.s, z21.s\n"
+ "add z0.s, z0.s, z19.s\n"
+ "add z27.s, z27.s, z19.s\n"
+ "smin z6.s, p1/M, z6.s, z20.s\n"
+ "smax z0.s, p1/M, z0.s, z21.s\n"
+ "add z22.s, z22.s, z19.s\n"
+ "smax z27.s, p1/M, z27.s, z21.s\n"
+ "smax z22.s, p1/M, z22.s, z21.s\n"
+ "st1b { z6.s }, p0, [x23, x24]\n"
+ "ld1w { z6.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z18.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z0.s, p1/M, z0.s, z20.s\n"
+ "smin z27.s, p1/M, z27.s, z20.s\n"
+ "smin z22.s, p1/M, z22.s, z20.s\n"
+ "st1b { z0.s }, p0, [x22, x24]\n"
+ "mov z0.d, z6.d\n"
+ "ld1b { z17.b }, p1/Z, [%x[params]]\n"
+ "st1b { z27.s }, p0, [x21, x24]\n"
+ "mov z27.d, z6.d\n"
+ "sdot z27.s, z18.b, z1.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "st1b { z22.s }, p0, [x20, x24]\n"
+ "mov z22.d, z6.d\n"
+ "sdot z6.s, z18.b, z5.b\n"
+ "sdot z6.s, z17.b, z1.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z0.s, z18.b, z5.b\n"
+ "ld1w { z4.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z22.s, z18.b, z1.b\n"
+ "sdot z27.s, z17.b, z28.b\n"
+ "incw x24\n"
+ "whilelt p0.s, x24, %x[n_channels]\n"
+ "sdot z6.s, z16.b, z28.b\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "sdot z0.s, z17.b, z1.b\n"
+ "whilelt p2.b, x13, %x[n_channels]\n"
+ "sdot z22.s, z17.b, z28.b\n"
+ "sdot z27.s, z16.b, z23.b\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z8.b }, p2/Z, [x11, x13]\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sdot z0.s, z16.b, z28.b\n"
+ "sdot z22.s, z16.b, z23.b\n"
+ "ld1b { z7.b }, p2/Z, [x10, x13]\n"
+ "and z18.d, z6.d, z4.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "ld1b { z5.b }, p2/Z, [x9, x13]\n"
+ "ld1b { z3.b }, p2/Z, [x27, x13]\n"
+ ".inst 0x04a97400 // sqrdmulh z0.s, z0.s, z9.s\n"
+ ".inst 0x04a9777b // sqrdmulh z27.s, z27.s, z9.s\n"
+ "ld1b { z2.b }, p2/Z, [x26, x13]\n"
+ "ld1b { z1.b }, p2/Z, [x25, x13]\n"
+ ".inst 0x04a976d6 // sqrdmulh z22.s, z22.s, z9.s\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ ".inst 0x44828486 // srshl z6.s, p1/M, z6.s, z4.s\n"
+ "ld1b { z9.b }, p2/Z, [x12, x13]\n"
+ "and z17.d, z0.d, z4.d\n"
+ "and z16.d, z27.d, z4.d\n"
+ "ldp x12, x11, [%x[inptrs], #0x40]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "and z18.d, z22.d, z4.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z31.s, z31.s, z16.s\n"
- "and z16.d, z19.d, z21.d\n"
+ "ld1b { z31.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z30.b }, p2/Z, [x11, x13]\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z20.s, z20.s, z18.s\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "add z31.s, z31.s, z4.s\n"
- ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
- "sqadd z19.s, z19.s, z16.s\n"
- ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
- "smax z31.s, p2/M, z31.s, z6.s\n"
- "add z20.s, z20.s, z4.s\n"
- ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
- "add z22.s, z22.s, z4.s\n"
- "smin z31.s, p2/M, z31.s, z5.s\n"
- "st1b { z31.s }, p0, [x22, x25]\n"
- "add z19.s, z19.s, z4.s\n"
- "smax z22.s, p2/M, z22.s, z6.s\n"
- "smax z20.s, p2/M, z20.s, z6.s\n"
- "smax z19.s, p2/M, z19.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z5.s\n"
- "st1b { z22.s }, p0, [x21, x25]\n"
- "smin z20.s, p2/M, z20.s, z5.s\n"
- "smin z19.s, p2/M, z19.s, z5.s\n"
- "st1b { z20.s }, p0, [x20, x25]\n"
- "st1b { z19.s }, p0, [x19, x25]\n"
- "incw x25\n"
- "whilelt p1.b, x25, %x[n_channels]\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "ld1b { z29.b }, p2/Z, [x10, x13]\n"
+ "ld1b { z28.b }, p2/Z, [x9, x13]\n"
+ "sqadd z0.s, z0.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828480 // srshl z0.s, p1/M, z0.s, z4.s\n"
+ ".inst 0x4482849b // srshl z27.s, p1/M, z27.s, z4.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ "add z6.s, z6.s, z19.s\n"
+ ".inst 0x44828496 // srshl z22.s, p1/M, z22.s, z4.s\n"
+ "smax z6.s, p1/M, z6.s, z21.s\n"
+ "add z0.s, z0.s, z19.s\n"
+ "add z27.s, z27.s, z19.s\n"
+ "ld1b { z4.b }, p2/Z, [x28, x13]\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "add z22.s, z22.s, z19.s\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "smin z6.s, p1/M, z6.s, z20.s\n"
+ "smax z0.s, p1/M, z0.s, z21.s\n"
+ "smax z27.s, p1/M, z27.s, z21.s\n"
+ "smax z22.s, p1/M, z22.s, z21.s\n"
+ "st1b { z6.s }, p0, [x23, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z25.b }, p2/Z, [x27, x13]\n"
+ "ld1b { z24.b }, p2/Z, [x26, x13]\n"
+ "zip2 z6.b, z9.b, z7.b\n"
+ "zip1 z9.b, z9.b, z7.b\n"
+ "ld1b { z23.b }, p2/Z, [x25, x13]\n"
+ "zip1 z7.b, z8.b, z5.b\n"
+ "zip2 z5.b, z8.b, z5.b\n"
+ "smin z0.s, p1/M, z0.s, z20.s\n"
+ "smin z27.s, p1/M, z27.s, z20.s\n"
+ "smin z22.s, p1/M, z22.s, z20.s\n"
+ "st1b { z0.s }, p0, [x22, x24]\n"
+ "zip2 z8.b, z9.b, z7.b\n"
+ "st1b { z27.s }, p0, [x21, x24]\n"
+ "zip1 z9.b, z9.b, z7.b\n"
+ "zip1 z7.b, z6.b, z5.b\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "st1b { z22.s }, p0, [x20, x24]\n"
+ "zip2 z5.b, z6.b, z5.b\n"
+ "zip2 z0.b, z4.b, z2.b\n"
+ "ld1w { z6.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z4.b, z4.b, z2.b\n"
+ "zip1 z2.b, z3.b, z1.b\n"
+ "incw x24\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "zip2 z1.b, z3.b, z1.b\n"
+ "zip2 z27.b, z31.b, z29.b\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip1 z31.b, z31.b, z29.b\n"
+ "zip1 z29.b, z30.b, z28.b\n"
+ "ld1b { z18.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "zip2 z28.b, z30.b, z28.b\n"
+ "zip2 z22.b, z26.b, z24.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip1 z26.b, z26.b, z24.b\n"
+ "zip1 z24.b, z25.b, z23.b\n"
+ "zip2 z23.b, z25.b, z23.b\n"
+ "zip2 z3.b, z4.b, z2.b\n"
+ "zip1 z4.b, z4.b, z2.b\n"
+ "zip1 z2.b, z0.b, z1.b\n"
+ "zip2 z1.b, z0.b, z1.b\n"
+ "zip2 z30.b, z31.b, z29.b\n"
+ "zip1 z31.b, z31.b, z29.b\n"
+ "zip1 z29.b, z27.b, z28.b\n"
+ "zip2 z28.b, z27.b, z28.b\n"
+ "zip2 z25.b, z26.b, z24.b\n"
+ "zip1 z26.b, z26.b, z24.b\n"
+ "zip1 z24.b, z22.b, z23.b\n"
+ "zip2 z23.b, z22.b, z23.b\n"
+ "mov z0.d, z6.d\n"
+ "mov z27.d, z6.d\n"
+ "mov z22.d, z6.d\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 440f57ed00..257c4d44dc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,421 +41,461 @@ void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ldp x11, x10, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "ldp x9, x28, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "ldp x27, x26, [%x[inptrs], #0x20]\n"
- "mov x19, #0x1\n"
- "ldp x25, x24, [%x[inptrs], #0x30]\n"
- "orr x19, x19, #0x100\n"
- "ldp x23, x22, [%x[outptrs], #0x0]\n"
- "orr x19, x19, #0x10000\n"
- "dup z12.s, w19\n"
- "ldp x21, x20, [%x[outptrs], #0x10]\n"
- "mov x19, #0x0\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "whilelt p1.b, x19, %x[n_channels]\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "mov x13, #0x0\n"
+ "whilelt p2.b, x13, %x[n_channels]\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "mov x20, #0x1\n"
+ "ptrue p1.b\n"
+ "ldp x24, x23, [%x[outptrs], #0x0]\n"
+ "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "orr x20, x20, #0x100\n"
+ "orr x20, x20, #0x10000\n"
+ "ld1b { z14.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z13.b }, p2/Z, [x11, x13]\n"
+ "dup z12.s, w20\n"
+ "mov x20, #0x0\n"
+ "ldp x12, x11, [%x[inptrs], #0x40]\n"
+ "ld1b { z11.b }, p2/Z, [x10, x13]\n"
+ "zip2 z10.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "ld1b { z9.b }, p2/Z, [x9, x13]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "zip1 z11.b, z13.b, z9.b\n"
+ "zip2 z9.b, z13.b, z9.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z7.b }, p2/Z, [x27, x13]\n"
+ "zip2 z13.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ld1b { z6.b }, p2/Z, [x26, x13]\n"
+ "zip1 z11.b, z10.b, z9.b\n"
+ "zip2 z9.b, z10.b, z9.b\n"
+ "ld1b { z5.b }, p2/Z, [x25, x13]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "zip2 z4.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "ld1b { z3.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z2.b }, p2/Z, [x11, x13]\n"
+ "zip1 z6.b, z7.b, z5.b\n"
+ "zip2 z5.b, z7.b, z5.b\n"
+ "ld1b { z1.b }, p2/Z, [x10, x13]\n"
+ "ld1b { z0.b }, p2/Z, [x9, x13]\n"
+ "zip2 z31.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "ld1b { z30.b }, p2/Z, [x28, x13]\n"
+ "ld1b { z29.b }, p2/Z, [x27, x13]\n"
+ "zip1 z1.b, z2.b, z0.b\n"
+ "zip2 z0.b, z2.b, z0.b\n"
+ "ld1b { z28.b }, p2/Z, [x26, x13]\n"
+ "ld1b { z27.b }, p2/Z, [x25, x13]\n"
+ "zip2 z26.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "zip1 z28.b, z29.b, z27.b\n"
+ "zip2 z27.b, z29.b, z27.b\n"
+ "ld1w { z10.s }, p1/Z, [%x[params]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z23.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z7.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "ld1rw { z22.s }, p1/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "zip1 z6.b, z4.b, z5.b\n"
+ "zip2 z5.b, z4.b, z5.b\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "zip2 z2.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "zip1 z1.b, z31.b, z0.b\n"
+ "zip2 z0.b, z31.b, z0.b\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z29.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z28.b, z26.b, z27.b\n"
+ "zip2 z27.b, z26.b, z27.b\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z4.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z26.d, z10.d\n"
"1:" // Loop
- "mov z7.s, #0x0\n"
- "ld1b { z19.b }, p1/Z, [x11, x19]\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "mov z6.s, #0x0\n"
- "ld1b { z18.b }, p1/Z, [x10, x19]\n"
- "ldp x11, x10, [%x[inptrs], #0x40]\n"
- "ld1b { z16.b }, p1/Z, [x9, x19]\n"
- "zip1 z21.b, z19.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x28, x19]\n"
- "zip2 z19.b, z19.b, z16.b\n"
- "ldp x9, x28, [%x[inptrs], #0x50]\n"
- "ld1b { z23.b }, p1/Z, [x27, x19]\n"
- "zip1 z16.b, z18.b, z17.b\n"
- "ld1b { z20.b }, p1/Z, [x26, x19]\n"
- "zip2 z18.b, z18.b, z17.b\n"
- "ldp x27, x26, [%x[inptrs], #0x60]\n"
- "zip1 z5.b, z21.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x25, x19]\n"
- "zip2 z4.b, z21.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x24, x19]\n"
- "zip1 z29.b, z19.b, z18.b\n"
- "ldp x25, x24, [%x[inptrs], #0x70]\n"
- "zip2 z28.b, z19.b, z18.b\n"
- "ld1b { z22.b }, p1/Z, [x11, x19]\n"
- "zip1 z19.b, z23.b, z17.b\n"
- "ld1b { z21.b }, p1/Z, [x10, x19]\n"
- "zip2 z27.b, z23.b, z17.b\n"
- "ldp x11, x10, [%x[inptrs], #0x0]\n"
- "zip1 z18.b, z20.b, z16.b\n"
- "ld1b { z17.b }, p1/Z, [x9, x19]\n"
- "zip2 z20.b, z20.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x28, x19]\n"
- "zip1 z3.b, z19.b, z18.b\n"
- "ldp x9, x28, [%x[inptrs], #0x10]\n"
- "zip2 z2.b, z19.b, z18.b\n"
- "ld1b { z19.b }, p1/Z, [x27, x19]\n"
- "zip1 z26.b, z22.b, z17.b\n"
- "ld1b { z25.b }, p1/Z, [x26, x19]\n"
- "zip2 z24.b, z22.b, z17.b\n"
- "ldp x27, x26, [%x[inptrs], #0x20]\n"
- "zip1 z23.b, z21.b, z16.b\n"
- "ld1b { z18.b }, p1/Z, [x25, x19]\n"
- "zip2 z22.b, z21.b, z16.b\n"
- "ld1b { z21.b }, p1/Z, [x24, x19]\n"
- "zip1 z17.b, z27.b, z20.b\n"
- "ldp x25, x24, [%x[inptrs], #0x30]\n"
- "zip2 z16.b, z27.b, z20.b\n"
- "st1b { z29.b }, p2, [SP]\n"
- "zip1 z20.b, z19.b, z18.b\n"
- "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
- "zip2 z19.b, z19.b, z18.b\n"
- "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
- "zip1 z18.b, z25.b, z21.b\n"
- "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
- "zip2 z17.b, z25.b, z21.b\n"
- "ld1w { z1.s }, p2/Z, [%x[params]]\n"
- "zip1 z0.b, z26.b, z23.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z30.b, z26.b, z23.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z16.b, z24.b, z22.b\n"
- "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
- "zip2 z16.b, z24.b, z22.b\n"
- "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
- "zip1 z28.b, z20.b, z18.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "zip2 z26.b, z20.b, z18.b\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "zip1 z16.b, z19.b, z17.b\n"
- "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
- "zip2 z16.b, z19.b, z17.b\n"
- "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
- "mov z24.d, z1.d\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "mov z22.d, z1.d\n"
- "mov z21.d, z1.d\n"
- "udot z1.s, z31.b, z5.b\n"
- "udot z22.s, z31.b, z3.b\n"
- "udot z7.s, z12.b, z3.b\n"
- "udot z1.s, z29.b, z3.b\n"
+ "mov z19.s, #0x0\n"
+ "udot z19.s, z12.b, z8.b\n"
+ "udot z10.s, z21.b, z14.b\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "udot z19.s, z12.b, z3.b\n"
+ "udot z31.s, z21.b, z8.b\n"
+ "incw x13, ALL, MUL #4\n"
+ "udot z10.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "movprfx z18, z19\n udot z18.s, z12.b, z30.b\n"
+ "udot z19.s, z12.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "udot z31.s, z16.b, z3.b\n"
+ "udot z10.s, z20.b, z3.b\n"
"ext z3.b, z3.b, z3.b, #0x1\n"
- "udot z22.s, z29.b, z0.b\n"
- "udot z7.s, z12.b, z0.b\n"
- "udot z1.s, z27.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "udot z22.s, z27.b, z28.b\n"
- "mov z20.d, z7.d\n"
- "udot z7.s, z12.b, z5.b\n"
- "udot z20.s, z12.b, z28.b\n"
- "ext z5.b, z5.b, z5.b, #0x1\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "udot z21.s, z31.b, z3.b\n"
- "udot z6.s, z12.b, z3.b\n"
- "udot z24.s, z31.b, z5.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "udot z21.s, z29.b, z0.b\n"
- "udot z6.s, z12.b, z0.b\n"
- "udot z24.s, z29.b, z3.b\n"
- "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "udot z21.s, z27.b, z28.b\n"
- "mov z19.d, z6.d\n"
- "udot z24.s, z27.b, z0.b\n"
- "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n"
- "udot z6.s, z12.b, z5.b\n"
- "ld1b { z5.b }, p2/Z, [SP]\n"
- "udot z19.s, z12.b, z28.b\n"
- "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n"
- "and z16.d, z1.d, z23.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "mov z7.s, #0x0\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "udot z7.s, z12.b, z2.b\n"
- "mov z6.s, #0x0\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- "udot z7.s, z12.b, z30.b\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z17.d, z22.d, z23.d\n"
- "mov z20.d, z7.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "udot z7.s, z12.b, z4.b\n"
- "udot z20.s, z12.b, z26.b\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- "add z1.s, z1.s, z8.s\n"
- "and z16.d, z21.d, z23.d\n"
+ "udot z4.s, z21.b, z14.b\n"
+ "udot z26.s, z21.b, z8.b\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z12.b, z8.b\n"
+ "udot z17.s, z12.b, z3.b\n"
+ "udot z31.s, z20.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "udot z4.s, z16.b, z8.b\n"
+ "udot z26.s, z16.b, z3.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "movprfx z16, z17\n udot z16.s, z12.b, z30.b\n"
+ "mov z19.s, #0x0\n"
+ "udot z17.s, z12.b, z14.b\n"
+ "ld1w { z14.s }, p1/Z, [%x[params]]\n"
+ "udot z4.s, z20.b, z3.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "udot z26.s, z20.b, z30.b\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "and z21.d, z10.d, z8.d\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ "udot z19.s, z12.b, z7.b\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "udot z19.s, z12.b, z2.b\n"
+ "and z16.d, z4.d, z8.d\n"
+ "and z20.d, z31.d, z8.d\n"
+ "movprfx z18, z19\n udot z18.s, z12.b, z29.b\n"
+ "ld1w { z14.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "and z21.d, z26.d, z8.d\n"
"asr z16.s, z16.s, #0x1f\n"
- "add z24.s, z24.s, z8.s\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z22.s, z22.s, z8.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "addvl %x[params], %x[params], #16\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "mov z24.d, z1.d\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "add z21.s, z21.s, z8.s\n"
- "mov z22.d, z1.d\n"
- "udot z22.s, z31.b, z2.b\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "udot z22.s, z29.b, z30.b\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "mov z21.d, z1.d\n"
- "incw x19\n"
- "udot z1.s, z31.b, z4.b\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "udot z22.s, z27.b, z26.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- "udot z1.s, z29.b, z2.b\n"
+ "udot z19.s, z12.b, z13.b\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "add z31.s, z31.s, z22.s\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "mov z4.d, z10.d\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "mov z31.d, z10.d\n"
+ "udot z31.s, z21.b, z7.b\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "mov z26.d, z10.d\n"
+ "udot z10.s, z21.b, z13.b\n"
+ "udot z10.s, z16.b, z7.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ "udot z4.s, z21.b, z13.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "mov z17.s, #0x0\n"
+ "udot z26.s, z21.b, z7.b\n"
+ "udot z17.s, z12.b, z7.b\n"
+ "incw x20\n"
+ "udot z31.s, z16.b, z2.b\n"
+ "udot z10.s, z20.b, z2.b\n"
"ext z2.b, z2.b, z2.b, #0x1\n"
- "udot z24.s, z31.b, z4.b\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- "udot z1.s, z27.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "udot z21.s, z31.b, z2.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "udot z24.s, z29.b, z2.b\n"
- "udot z6.s, z12.b, z2.b\n"
- "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- "udot z21.s, z29.b, z30.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "udot z24.s, z27.b, z30.b\n"
- "udot z6.s, z12.b, z30.b\n"
- "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n"
- "and z17.d, z22.d, z23.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "udot z21.s, z27.b, z26.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
- "mov z19.d, z6.d\n"
- "udot z6.s, z12.b, z4.b\n"
- "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n"
- "udot z19.s, z12.b, z26.b\n"
- "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "mov z7.s, #0x0\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "udot z7.s, z12.b, z3.b\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "mov z6.s, #0x0\n"
- "udot z7.s, z12.b, z0.b\n"
- "and z16.d, z1.d, z23.d\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "udot z4.s, z16.b, z7.b\n"
+ "udot z26.s, z16.b, z2.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "udot z17.s, z12.b, z2.b\n"
+ "udot z31.s, z20.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "udot z4.s, z20.b, z2.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "udot z26.s, z20.b, z29.b\n"
+ "movprfx z16, z17\n udot z16.s, z12.b, z29.b\n"
+ "and z21.d, z10.d, z8.d\n"
+ "udot z17.s, z12.b, z13.b\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "ld1w { z14.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "and z16.d, z4.d, z8.d\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "and z20.d, z31.d, z8.d\n"
+ "and z21.d, z26.d, z8.d\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "mov z20.d, z7.d\n"
- "udot z7.s, z12.b, z5.b\n"
- "udot z20.s, z12.b, z28.b\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [%x[params]]\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- "and z16.d, z21.d, z23.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "add z31.s, z31.s, z22.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "mov z19.s, #0x0\n"
+ "udot z19.s, z12.b, z6.b\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "mov z4.d, z10.d\n"
+ "udot z19.s, z12.b, z1.b\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "mov z31.d, z10.d\n"
+ "udot z31.s, z21.b, z6.b\n"
+ "movprfx z18, z19\n udot z18.s, z12.b, z28.b\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "mov z26.d, z10.d\n"
+ "udot z10.s, z21.b, z11.b\n"
+ "udot z10.s, z16.b, z6.b\n"
+ "udot z19.s, z12.b, z11.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z4.s, z21.b, z11.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "mov z17.s, #0x0\n"
+ "udot z26.s, z21.b, z6.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "udot z17.s, z12.b, z6.b\n"
+ "udot z31.s, z16.b, z1.b\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "udot z10.s, z20.b, z1.b\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "udot z4.s, z16.b, z6.b\n"
+ "udot z26.s, z16.b, z1.b\n"
+ "udot z17.s, z12.b, z1.b\n"
+ "udot z31.s, z20.b, z28.b\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "udot z4.s, z20.b, z1.b\n"
+ "udot z26.s, z20.b, z28.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "movprfx z16, z17\n udot z16.s, z12.b, z28.b\n"
+ "udot z17.s, z12.b, z11.b\n"
+ "and z21.d, z10.d, z8.d\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "ld1w { z14.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "and z16.d, z4.d, z8.d\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "and z20.d, z31.d, z8.d\n"
+ "and z21.d, z26.d, z8.d\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "add z22.s, z22.s, z8.s\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- "add z1.s, z1.s, z8.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "add z24.s, z24.s, z8.s\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z21.s, z21.s, z8.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "mov z24.d, z1.d\n"
- "mov z22.d, z1.d\n"
- "udot z22.s, z31.b, z3.b\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "mov z21.d, z1.d\n"
- "incw x19\n"
- "udot z1.s, z31.b, z5.b\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "udot z22.s, z29.b, z0.b\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ "ld1b { z16.b }, p1/Z, [%x[params]]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "add z31.s, z31.s, z22.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "mov z19.s, #0x0\n"
+ "udot z19.s, z12.b, z5.b\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "mov z4.d, z10.d\n"
+ "udot z19.s, z12.b, z0.b\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "mov z31.d, z10.d\n"
+ "udot z31.s, z21.b, z5.b\n"
+ "movprfx z18, z19\n udot z18.s, z12.b, z27.b\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "mov z26.d, z10.d\n"
+ "udot z10.s, z21.b, z9.b\n"
+ "udot z10.s, z16.b, z5.b\n"
+ "udot z19.s, z12.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "udot z4.s, z21.b, z9.b\n"
"ext z5.b, z5.b, z5.b, #0x1\n"
- "udot z1.s, z29.b, z3.b\n"
- "udot z22.s, z27.b, z28.b\n"
- "ext z3.b, z3.b, z3.b, #0x1\n"
- "ext z28.b, z28.b, z28.b, #0x1\n"
- "udot z24.s, z31.b, z5.b\n"
- "udot z1.s, z27.b, z0.b\n"
+ "mov z17.s, #0x0\n"
+ "udot z26.s, z21.b, z5.b\n"
+ "ld1w { z8.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "udot z17.s, z12.b, z5.b\n"
+ "udot z31.s, z16.b, z0.b\n"
+ "incw x20\n"
+ "whilelt p0.s, x20, %x[n_channels]\n"
+ "udot z10.s, z20.b, z0.b\n"
"ext z0.b, z0.b, z0.b, #0x1\n"
- "udot z21.s, z31.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "udot z24.s, z29.b, z3.b\n"
- "udot z6.s, z12.b, z3.b\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "udot z21.s, z29.b, z0.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "udot z24.s, z27.b, z0.b\n"
- "udot z6.s, z12.b, z0.b\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "udot z21.s, z27.b, z28.b\n"
- "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "mov z7.s, #0x0\n"
- "mov z19.d, z6.d\n"
- "udot z6.s, z12.b, z5.b\n"
- "udot z19.s, z12.b, z28.b\n"
- "and z16.d, z1.d, z23.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "udot z7.s, z12.b, z2.b\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "mov z6.s, #0x0\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z17.d, z22.d, z23.d\n"
- "and z16.d, z21.d, z23.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "udot z7.s, z12.b, z30.b\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "udot z4.s, z16.b, z5.b\n"
+ "whilelt p2.b, x13, %x[n_channels]\n"
+ "udot z26.s, z16.b, z0.b\n"
+ "udot z17.s, z12.b, z0.b\n"
+ "ld1b { z13.b }, p2/Z, [x11, x13]\n"
+ "ld1b { z11.b }, p2/Z, [x10, x13]\n"
+ "udot z31.s, z20.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "mls z10.s, p1/M, z19.s, z23.s\n"
+ "ld1b { z7.b }, p2/Z, [x27, x13]\n"
+ "udot z4.s, z20.b, z0.b\n"
+ "udot z26.s, z20.b, z27.b\n"
+ ".inst 0x04ae754a // sqrdmulh z10.s, z10.s, z14.s\n"
+ "ld1b { z6.b }, p2/Z, [x26, x13]\n"
+ "movprfx z16, z17\n udot z16.s, z12.b, z27.b\n"
+ "udot z17.s, z12.b, z9.b\n"
+ "and z21.d, z10.d, z8.d\n"
+ "ld1b { z9.b }, p2/Z, [x9, x13]\n"
+ "mls z4.s, p1/M, z17.s, z23.s\n"
+ "mls z31.s, p1/M, z18.s, z23.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "ld1b { z5.b }, p2/Z, [x25, x13]\n"
+ "mls z26.s, p1/M, z16.s, z23.s\n"
+ ".inst 0x04ae7484 // sqrdmulh z4.s, z4.s, z14.s\n"
+ ".inst 0x04ae77ff // sqrdmulh z31.s, z31.s, z14.s\n"
+ ".inst 0x04ae775a // sqrdmulh z26.s, z26.s, z14.s\n"
+ "ld1b { z14.b }, p2/Z, [x12, x13]\n"
+ "ldp x12, x11, [%x[inptrs], #0x40]\n"
+ "sqadd z10.s, z10.s, z21.s\n"
+ "and z16.d, z4.d, z8.d\n"
+ ".inst 0x4482850a // srshl z10.s, p1/M, z10.s, z8.s\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z8.d\n"
+ "and z21.d, z26.d, z8.d\n"
+ "ld1b { z3.b }, p2/Z, [x12, x13]\n"
+ "ld1b { z2.b }, p2/Z, [x11, x13]\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "add z1.s, z1.s, z8.s\n"
- "mov z20.d, z7.d\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "udot z7.s, z12.b, z4.b\n"
- "udot z20.s, z12.b, z26.b\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z24.s, z24.s, z8.s\n"
- "add z22.s, z22.s, z8.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "ld1b { z1.b }, p2/Z, [x10, x13]\n"
+ "ld1b { z0.b }, p2/Z, [x9, x13]\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z16.s\n"
+ ".inst 0x44828504 // srshl z4.s, p1/M, z4.s, z8.s\n"
+ "ld1b { z16.b }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z26.s, z26.s, z21.s\n"
+ ".inst 0x4482851f // srshl z31.s, p1/M, z31.s, z8.s\n"
+ ".inst 0x4482851a // srshl z26.s, p1/M, z26.s, z8.s\n"
+ "add z10.s, z10.s, z22.s\n"
+ "smax z10.s, p1/M, z10.s, z25.s\n"
+ "add z4.s, z4.s, z22.s\n"
+ "ld1b { z8.b }, p2/Z, [x28, x13]\n"
+ "add z31.s, z31.s, z22.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldp x26, x25, [%x[inptrs], #0x70]\n"
+ "smin z10.s, p1/M, z10.s, z24.s\n"
+ "smax z4.s, p1/M, z4.s, z25.s\n"
+ "st1b { z10.s }, p0, [x24, x20]\n"
+ "ld1b { z30.b }, p2/Z, [x28, x13]\n"
+ "smax z31.s, p1/M, z31.s, z25.s\n"
+ "smax z26.s, p1/M, z26.s, z25.s\n"
+ "ld1b { z29.b }, p2/Z, [x27, x13]\n"
+ "ld1b { z28.b }, p2/Z, [x26, x13]\n"
+ "ld1b { z27.b }, p2/Z, [x25, x13]\n"
+ "zip2 z10.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "smin z4.s, p1/M, z4.s, z24.s\n"
+ "zip1 z11.b, z13.b, z9.b\n"
+ "zip2 z9.b, z13.b, z9.b\n"
+ "smin z31.s, p1/M, z31.s, z24.s\n"
+ "smin z26.s, p1/M, z26.s, z24.s\n"
+ "st1b { z4.s }, p0, [x23, x20]\n"
+ "zip2 z13.b, z14.b, z11.b\n"
+ "zip1 z14.b, z14.b, z11.b\n"
+ "ldp x12, x11, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p0, [x22, x20]\n"
+ "zip1 z11.b, z10.b, z9.b\n"
+ "zip2 z9.b, z10.b, z9.b\n"
+ "ld1w { z10.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z26.s }, p0, [x21, x20]\n"
+ "zip2 z4.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "incw x20\n"
+ "zip1 z6.b, z7.b, z5.b\n"
+ "zip2 z5.b, z7.b, z5.b\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x30]\n"
+ "ld1b { z21.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z1.b, z2.b, z0.b\n"
+ "zip2 z0.b, z2.b, z0.b\n"
+ "ld1b { z20.b }, p1/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "add z21.s, z21.s, z8.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "mov z24.d, z1.d\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "mov z22.d, z1.d\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "mov z21.d, z1.d\n"
- "incw x19\n"
- "udot z1.s, z31.b, z4.b\n"
- "whilelt p0.s, x19, %x[n_channels]\n"
- "udot z22.s, z31.b, z2.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "udot z1.s, z29.b, z2.b\n"
- "udot z22.s, z29.b, z30.b\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "udot z24.s, z31.b, z4.b\n"
- "udot z1.s, z27.b, z30.b\n"
- "udot z22.s, z27.b, z26.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "ext z26.b, z26.b, z26.b, #0x1\n"
- "udot z21.s, z31.b, z2.b\n"
- "udot z24.s, z29.b, z2.b\n"
- "udot z6.s, z12.b, z2.b\n"
- "mls z1.s, p2/M, z7.s, z9.s\n"
- "udot z21.s, z29.b, z30.b\n"
- "udot z24.s, z27.b, z30.b\n"
- "udot z6.s, z12.b, z30.b\n"
- ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
- "udot z21.s, z27.b, z26.b\n"
- "mls z22.s, p2/M, z20.s, z9.s\n"
- "mov z19.d, z6.d\n"
- "udot z6.s, z12.b, z4.b\n"
- "udot z19.s, z12.b, z26.b\n"
- "and z16.d, z1.d, z23.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
- "mls z24.s, p2/M, z6.s, z9.s\n"
- "mls z21.s, p2/M, z19.s, z9.s\n"
- ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
- "and z17.d, z22.d, z23.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
- "and z18.d, z24.d, z23.d\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z16.d, z21.d, z23.d\n"
- ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z22.s, z22.s, z17.s\n"
- "add z1.s, z1.s, z8.s\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "smax z1.s, p2/M, z1.s, z11.s\n"
- ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
- "add z22.s, z22.s, z8.s\n"
- "smin z1.s, p2/M, z1.s, z10.s\n"
- "st1b { z1.s }, p0, [x23, x19]\n"
- "add z24.s, z24.s, z8.s\n"
- "smax z22.s, p2/M, z22.s, z11.s\n"
- ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
- "smax z24.s, p2/M, z24.s, z11.s\n"
- "smin z22.s, p2/M, z22.s, z10.s\n"
- "st1b { z22.s }, p0, [x21, x19]\n"
- "add z21.s, z21.s, z8.s\n"
- "smin z24.s, p2/M, z24.s, z10.s\n"
- "st1b { z24.s }, p0, [x22, x19]\n"
- "smax z21.s, p2/M, z21.s, z11.s\n"
- "smin z21.s, p2/M, z21.s, z10.s\n"
- "st1b { z21.s }, p0, [x20, x19]\n"
- "incw x19\n"
- "whilelt p1.b, x19, %x[n_channels]\n"
+ "zip2 z26.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "zip1 z28.b, z29.b, z27.b\n"
+ "zip2 z27.b, z29.b, z27.b\n"
+ "zip2 z7.b, z8.b, z6.b\n"
+ "zip1 z8.b, z8.b, z6.b\n"
+ "zip1 z6.b, z4.b, z5.b\n"
+ "zip2 z5.b, z4.b, z5.b\n"
+ "zip2 z2.b, z3.b, z1.b\n"
+ "zip1 z3.b, z3.b, z1.b\n"
+ "zip1 z1.b, z31.b, z0.b\n"
+ "zip2 z0.b, z31.b, z0.b\n"
+ "zip2 z29.b, z30.b, z28.b\n"
+ "zip1 z30.b, z30.b, z28.b\n"
+ "zip1 z28.b, z26.b, z27.b\n"
+ "zip2 z27.b, z26.b, z27.b\n"
+ "mov z4.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z26.d, z10.d\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 7bfa5fc4c7..386eb96cff 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,324 +91,316 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x8, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x8\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x15, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x14, #0x0\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z12.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z18.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z13.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x15, x17\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x15, x17\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "mov x19, x15\n"
- "incw x19\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "whilelt p1.s, x19, x17\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z17.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z17.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "mov z9.d, z11.d\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z23.b }, p4/Z, [x21]\n"
+ "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z14.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z11.h }, p4/Z, [x20]\n"
+ "ldp x15, x14, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x8, x17\n"
+ "ldp x13, x12, [x24, #0x10]\n"
+ "whilelt p2.s, x8, x17\n"
+ "whilelt p1.s, x23, x17\n"
+ "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
"ld1b { z0.h }, p4/Z, [x16]\n"
- ".inst 0x45521800 // usublb z0.h, z0.b, z18.b\n"
- "mov z20.d, z17.d\n"
"ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "mov z24.d, z11.d\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
"ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- ".inst 0x45521821 // usublb z1.h, z1.b, z18.b\n"
- "mov z19.d, z17.d\n"
"ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- "mov z26.d, z11.d\n"
+ ".inst 0x454f1800 // usublb z0.h, z0.b, z15.b\n"
+ ".inst 0x454f1821 // usublb z1.h, z1.b, z15.b\n"
"ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- ".inst 0x45521842 // usublb z2.h, z2.b, z18.b\n"
- "mov z23.d, z17.d\n"
"ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x45521863 // usublb z3.h, z3.b, z18.b\n"
+ ".inst 0x454f1842 // usublb z2.h, z2.b, z15.b\n"
+ ".inst 0x454f1863 // usublb z3.h, z3.b, z15.b\n"
"ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
"ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- ".inst 0x45521884 // usublb z4.h, z4.b, z18.b\n"
"inch x16, ALL, MUL #8\n"
+ ".inst 0x454f1884 // usublb z4.h, z4.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z13.s, z17.s, z16.s\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
"ld1b { z8.h }, p4/Z, [x16]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- ".inst 0x455218a5 // usublb z5.h, z5.b, z18.b\n"
- ".inst 0x455218c6 // usublb z6.h, z6.b, z18.b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- ".inst 0x455218e7 // usublb z7.h, z7.b, z18.b\n"
- ".inst 0x45521908 // usublb z8.h, z8.b, z18.b\n"
- "ldr x19, [x12, #0x20]\n"
- "ld1b { z31.h }, p3/Z, [x23, x15]\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- "ld1b { z30.h }, p3/Z, [x22, x15]\n"
- "ld1b { z29.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
- "ld1b { z28.h }, p3/Z, [x20, x15]\n"
- "ld1b { z27.h }, p3/Z, [x19, x15]\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- ".inst 0x454c1b7b // usublb z27.h, z27.b, z12.b\n"
+ "ldp x24, x23, [x11, #0x0]\n"
+ "addvl x26, x26, #2\n"
+ "mov z26.d, z13.d\n"
+ "ldp x22, x21, [x11, #0x10]\n"
+ "ldr x20, [x11, #0x20]\n"
+ "mov z10.d, z17.d\n"
+ "mov z24.d, z13.d\n"
+ "ld1b { z31.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z30.h }, p3/Z, [x23, x8]\n"
+ "mov z16.d, z17.d\n"
+ "mov z25.d, z13.d\n"
+ "ld1b { z29.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z28.h }, p3/Z, [x21, x8]\n"
+ "mov z9.d, z17.d\n"
+ ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
+ "ld1b { z27.h }, p3/Z, [x20, x8]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
+ ".inst 0x454f18e7 // usublb z7.h, z7.b, z15.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x26, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454f1908 // usublb z8.h, z8.b, z15.b\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
"1:" // Loop
- ".inst 0x448443eb // smlalb z11.s, p4/M, z31.h, z4.h\n"
- "ldr x21, [x12, #0x28]\n"
- "whilelt p0.h, x14, x17\n"
+ ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
- "ldr x20, [x12, #0x30]\n"
- "inch x16\n"
- ".inst 0x448343e9 // smlalb z9.s, p4/M, z31.h, z3.h\n"
- "ldr x26, [x12, #0x38]\n"
- ".inst 0x448347f4 // smlalt z20.s, p4/M, z31.h, z3.h\n"
- "ldr x25, [x12, #0x40]\n"
- ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
- "ldr x19, [x12, #0x48]\n"
- ".inst 0x448147f3 // smlalt z19.s, p4/M, z31.h, z1.h\n"
- "ldr x24, [x12, #0x50]\n"
- ".inst 0x448043fa // smlalb z26.s, p4/M, z31.h, z0.h\n"
- "ldr x23, [x12, #0x58]\n"
- ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- ".inst 0x448043cb // smlalb z11.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x12, #0x60]\n"
+ "ldr x22, [x11, #0x28]\n"
+ "ldr x27, [x11, #0x38]\n"
+ ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
+ "ldr x21, [x11, #0x30]\n"
+ "ldr x26, [x11, #0x40]\n"
+ ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x19, x15]\n"
- ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
- ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
- "ldr x21, [x12, #0x68]\n"
- ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x15]\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
- "ldr x20, [x12, #0x70]\n"
+ "ldr x20, [x11, #0x48]\n"
+ "ld1b { z30.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
+ "ldr x25, [x11, #0x50]\n"
+ "ldr x24, [x11, #0x58]\n"
+ ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
- "ldr x19, [x12, #0x78]\n"
- ".inst 0x44844389 // smlalb z9.s, p4/M, z28.h, z4.h\n"
- "ld1w { z25.s }, p2/Z, [x13]\n"
- ".inst 0x44844794 // smlalt z20.s, p4/M, z28.h, z4.h\n"
- "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n"
- "addvl x13, x13, #2\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ "ldr x23, [x11, #0x60]\n"
+ ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
+ "ldr x22, [x11, #0x68]\n"
+ "ldr x21, [x11, #0x70]\n"
".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824793 // smlalt z19.s, p4/M, z28.h, z2.h\n"
- ".inst 0x4481439a // smlalb z26.s, p4/M, z28.h, z1.h\n"
- "uzp1 z10.s, z25.s, z16.s\n"
- "uzp2 z22.s, z25.s, z16.s\n"
- "ld1w { z25.s }, p2/Z, [x11]\n"
- ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x26, x15]\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
- "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n"
- ".inst 0x448647f3 // smlalt z19.s, p4/M, z31.h, z6.h\n"
- "ld1b { z31.h }, p3/Z, [x25, x15]\n"
- "addvl x11, x11, #2\n"
- ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- "uzp1 z21.s, z25.s, z16.s\n"
- "uzp2 z25.s, z25.s, z16.s\n"
+ ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
+ "ldr x20, [x11, #0x78]\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x27, x8]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44864369 // smlalb z9.s, p4/M, z27.h, z6.h\n"
- ".inst 0x44864774 // smlalt z20.s, p4/M, z27.h, z6.h\n"
- ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844773 // smlalt z19.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4483437a // smlalb z26.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834777 // smlalt z23.s, p4/M, z27.h, z3.h\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z19.s, z20.s, z18.s\n"
+ ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
+ "uzp2 z22.s, z20.s, z18.s\n"
+ "ld1w { z20.s }, p2/Z, [x28]\n"
+ ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
+ "ld1b { z31.h }, p3/Z, [x26, x8]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
+ "whilelt p0.h, x10, x17\n"
+ "inch x16\n"
+ ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448843ba // smlalb z26.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x24, x15]\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x44804389 // smlalb z9.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44804794 // smlalt z20.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x23, x15]\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z28.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
+ "ld1b { z29.h }, p3/Z, [x25, x8]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448143e9 // smlalb z9.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147f4 // smlalt z20.s, p4/M, z31.h, z1.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x15]\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- ".inst 0x448843cb // smlalb z11.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448743c9 // smlalb z9.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448747d4 // smlalt z20.s, p4/M, z30.h, z7.h\n"
+ "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
+ "ld1b { z31.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448547d3 // smlalt z19.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443da // smlalb z26.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448447d7 // smlalt z23.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
- ".inst 0x448343ab // smlalb z11.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
+ "uzp1 z1.s, z20.s, z18.s\n"
+ ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
+ "uzp2 z27.s, z20.s, z18.s\n"
+ ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
+ "ld1b { z30.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
- ".inst 0x448047b3 // smlalt z19.s, p4/M, z29.h, z0.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x15]\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854794 // smlalt z20.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4482439a // smlalb z26.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44824797 // smlalt z23.s, p4/M, z28.h, z2.h\n"
- "ld1b { z28.h }, p3/Z, [x19, x15]\n"
- "inch x15\n"
- ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
- "whilelt p2.s, x15, x17\n"
- ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
- "mov x19, x15\n"
+ ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- ".inst 0x448347f3 // smlalt z19.s, p4/M, z31.h, z3.h\n"
- "incw x19\n"
- ".inst 0x448843c9 // smlalb z9.s, p4/M, z30.h, z8.h\n"
- "whilelt p1.s, x19, x17\n"
- ".inst 0x04aa756b // sqrdmulh z11.s, z11.s, z10.s\n"
- "whilelt p3.h, x15, x17\n"
- ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
- ".inst 0x448847d4 // smlalt z20.s, p4/M, z30.h, z8.h\n"
- ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
- "and z16.d, z11.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z1.d, z17.d, z25.d\n"
- "and z27.d, z9.d, z21.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n"
- ".inst 0x448543da // smlalb z26.s, p4/M, z30.h, z5.h\n"
- "asr z27.s, z27.s, #0x1f\n"
- ".inst 0x448547d7 // smlalt z23.s, p4/M, z30.h, z5.h\n"
- "sqadd z11.s, z11.s, z16.s\n"
+ ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "inch x8\n"
+ ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
- "and z16.d, z20.d, z25.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z17.s, z17.s, z1.s\n"
- "sqadd z9.s, z9.s, z27.s\n"
- ".inst 0x448747b3 // smlalt z19.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448643ba // smlalb z26.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647b7 // smlalt z23.s, p4/M, z29.h, z6.h\n"
+ "and z21.d, z13.d, z1.d\n"
+ "mov x20, x8\n"
+ ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ "incw x20\n"
+ ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "whilelt p2.s, x8, x17\n"
+ ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
- "sqadd z20.s, z20.s, z16.s\n"
- ".inst 0x44884793 // smlalt z19.s, p4/M, z28.h, z8.h\n"
- ".inst 0x4487439a // smlalb z26.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04aa7718 // sqrdmulh z24.s, z24.s, z10.s\n"
- ".inst 0x44874797 // smlalt z23.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n"
- ".inst 0x04aa775a // sqrdmulh z26.s, z26.s, z10.s\n"
- "and z16.d, z24.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z7.d, z19.d, z25.d\n"
- "and z3.d, z26.d, z21.d\n"
- "asr z7.s, z7.s, #0x1f\n"
- ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
- ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n"
- "sqadd z24.s, z24.s, z16.s\n"
- ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
- "add z11.s, z11.s, z15.s\n"
- "add z17.s, z17.s, z15.s\n"
- "sqadd z19.s, z19.s, z7.s\n"
- "add z9.s, z9.s, z15.s\n"
- "sqadd z26.s, z26.s, z3.s\n"
- "and z16.d, z23.d, z25.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "smin z11.s, p4/M, z11.s, z14.s\n"
- "smin z17.s, p4/M, z17.s, z14.s\n"
- "smin z9.s, p4/M, z9.s, z14.s\n"
- ".inst 0x44829334 // srshl z20.s, p4/M, z20.s, z25.s\n"
- ".inst 0x448292b8 // srshl z24.s, p4/M, z24.s, z21.s\n"
- "smax z11.s, p4/M, z11.s, z13.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- "add z20.s, z20.s, z15.s\n"
- "add z24.s, z24.s, z15.s\n"
- "smax z17.s, p4/M, z17.s, z13.s\n"
- "smax z9.s, p4/M, z9.s, z13.s\n"
- "smin z20.s, p4/M, z20.s, z14.s\n"
- "smin z24.s, p4/M, z24.s, z14.s\n"
- "trn1 z11.h, z11.h, z17.h\n"
- "st1b { z11.h }, p0, [x10, x14]\n"
- "smax z20.s, p4/M, z20.s, z13.s\n"
- ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n"
- "smax z24.s, p4/M, z24.s, z13.s\n"
- ".inst 0x448292ba // srshl z26.s, p4/M, z26.s, z21.s\n"
- ".inst 0x44829337 // srshl z23.s, p4/M, z23.s, z25.s\n"
- "trn1 z9.h, z9.h, z20.h\n"
- "st1b { z9.h }, p0, [x9, x14]\n"
- "add z19.s, z19.s, z15.s\n"
- "add z26.s, z26.s, z15.s\n"
- "add z23.s, z23.s, z15.s\n"
- "smin z19.s, p4/M, z19.s, z14.s\n"
- "smin z26.s, p4/M, z26.s, z14.s\n"
- "smin z23.s, p4/M, z23.s, z14.s\n"
- "smax z19.s, p4/M, z19.s, z13.s\n"
- "smax z26.s, p4/M, z26.s, z13.s\n"
- "smax z23.s, p4/M, z23.s, z13.s\n"
- "trn1 z24.h, z24.h, z19.h\n"
- "st1b { z24.h }, p0, [x28, x14]\n"
- "trn1 z26.h, z26.h, z23.h\n"
- "st1b { z26.h }, p0, [x27, x14]\n"
- "inch x14\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z17.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z17.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z17.s, z17.s, z16.s\n"
- "mov z9.d, z11.d\n"
+ "and z20.d, z17.d, z27.d\n"
+ "whilelt p1.s, x20, x17\n"
+ ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
+ "whilelt p3.h, x8, x17\n"
+ ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
+ "sqadd z13.s, z13.s, z21.s\n"
+ ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z26.d, z1.d\n"
+ ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
+ "and z18.d, z24.d, z1.d\n"
+ ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
+ "and z21.d, z25.d, z1.d\n"
+ ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
+ "sqadd z17.s, z17.s, z20.s\n"
+ ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z10.d, z27.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z22.d, z16.d, z27.d\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z20.d, z9.d, z27.d\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
+ "sqadd z25.s, z25.s, z21.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ "sqadd z10.s, z10.s, z2.s\n"
+ "sqadd z16.s, z16.s, z22.s\n"
+ ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
+ ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
+ "sqadd z9.s, z9.s, z20.s\n"
+ ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
+ ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
+ ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
+ ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
+ ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
+ ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
+ ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
+ ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
+ ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
+ "sqadd z13.h, z13.h, z14.h\n"
+ "smax z13.h, p4/M, z13.h, z12.h\n"
+ "smin z13.h, p4/M, z13.h, z11.h\n"
+ "sqadd z26.h, z26.h, z14.h\n"
+ "sqadd z24.h, z24.h, z14.h\n"
+ "smax z26.h, p4/M, z26.h, z12.h\n"
+ "smax z24.h, p4/M, z24.h, z12.h\n"
+ "sqadd z25.h, z25.h, z14.h\n"
+ "smax z25.h, p4/M, z25.h, z12.h\n"
+ "smin z26.h, p4/M, z26.h, z11.h\n"
+ "st1b { z13.h }, p0, [x15, x10]\n"
+ "smin z24.h, p4/M, z24.h, z11.h\n"
+ "smin z25.h, p4/M, z25.h, z11.h\n"
+ "st1b { z26.h }, p0, [x14, x10]\n"
+ "st1b { z24.h }, p0, [x13, x10]\n"
+ "st1b { z25.h }, p0, [x12, x10]\n"
"ld1b { z0.h }, p4/Z, [x16]\n"
- ".inst 0x45521800 // usublb z0.h, z0.b, z18.b\n"
- "mov z20.d, z17.d\n"
"ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
- "mov z24.d, z11.d\n"
+ "inch x10\n"
"ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
- ".inst 0x45521821 // usublb z1.h, z1.b, z18.b\n"
- "mov z19.d, z17.d\n"
"ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
- "mov z26.d, z11.d\n"
+ ".inst 0x454f1800 // usublb z0.h, z0.b, z15.b\n"
+ ".inst 0x454f1821 // usublb z1.h, z1.b, z15.b\n"
"ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
- ".inst 0x45521842 // usublb z2.h, z2.b, z18.b\n"
- "mov z23.d, z17.d\n"
"ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
- ".inst 0x45521863 // usublb z3.h, z3.b, z18.b\n"
+ ".inst 0x454f1842 // usublb z2.h, z2.b, z15.b\n"
+ ".inst 0x454f1863 // usublb z3.h, z3.b, z15.b\n"
"ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
"ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
- ".inst 0x45521884 // usublb z4.h, z4.b, z18.b\n"
"inch x16, ALL, MUL #8\n"
+ ".inst 0x454f1884 // usublb z4.h, z4.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z13.s, z17.s, z16.s\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
"ld1b { z8.h }, p4/Z, [x16]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- ".inst 0x455218a5 // usublb z5.h, z5.b, z18.b\n"
- ".inst 0x455218c6 // usublb z6.h, z6.b, z18.b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- ".inst 0x455218e7 // usublb z7.h, z7.b, z18.b\n"
- ".inst 0x45521908 // usublb z8.h, z8.b, z18.b\n"
- "ldr x19, [x12, #0x20]\n"
- "ld1b { z31.h }, p3/Z, [x23, x15]\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- "ld1b { z30.h }, p3/Z, [x22, x15]\n"
- "ld1b { z29.h }, p3/Z, [x21, x15]\n"
- ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
- "ld1b { z28.h }, p3/Z, [x20, x15]\n"
- "ld1b { z27.h }, p3/Z, [x19, x15]\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- ".inst 0x454c1b7b // usublb z27.h, z27.b, z12.b\n"
+ "ldp x24, x23, [x11, #0x0]\n"
+ "addvl x26, x26, #2\n"
+ "str x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x11, #0x10]\n"
+ "ldr x20, [x11, #0x20]\n"
+ "mov z26.d, z13.d\n"
+ "mov z10.d, z17.d\n"
+ "ld1b { z31.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z30.h }, p3/Z, [x23, x8]\n"
+ "mov z24.d, z13.d\n"
+ "mov z16.d, z17.d\n"
+ "ld1b { z29.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z28.h }, p3/Z, [x21, x8]\n"
+ "mov z25.d, z13.d\n"
+ "mov z9.d, z17.d\n"
+ "ld1b { z27.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
+ ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
+ ".inst 0x454f18e7 // usublb z7.h, z7.b, z15.b\n"
+ ".inst 0x454f1908 // usublb z8.h, z8.b, z15.b\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index e1b2d257b0..9f21401840 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,356 +100,348 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ptrue p4.b\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
"mov x7, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x8, #0x0\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x16, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z19.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z23.b }, p4/Z, [x21]\n"
"ld1rb { z12.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z20.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x7, x5\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x7, x5\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "mov x19, x7\n"
- "incw x19\n"
- "ldp x12, x11, [x21, #0x10]\n"
- "whilelt p1.s, x19, x5\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z18.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z18.s, z16.s\n"
- "mov z11.d, z13.d\n"
- "ld1b { z0.h }, p4/Z, [x6]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z14.h }, p4/Z, [x22]\n"
+ "ld1rh { z16.h }, p4/Z, [x21]\n"
+ "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z0.h }, p4/Z, [x17]\n"
+ "ld1b { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1b { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- "mov z9.d, z16.d\n"
- "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
- "mov z18.d, z13.d\n"
- "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
- "mov z10.d, z16.d\n"
- "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
- "mov z22.d, z13.d\n"
- "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
- "mov z23.d, z16.d\n"
- "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- "inch x6, ALL, MUL #8\n"
- "ld1b { z8.h }, p4/Z, [x6]\n"
- "ldp x26, x25, [x16, #0x0]\n"
+ "ld1w { z18.s }, p2/Z, [x12]\n"
+ "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z8.s\n"
+ "uzp2 z17.s, z18.s, z8.s\n"
+ "ld1b { z8.h }, p4/Z, [x17]\n"
+ "ldp x9, x28, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z9.d, z13.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z10.d, z17.d\n"
+ "mov z11.d, z13.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x9, x7]\n"
+ "mov z22.d, z17.d\n"
+ "mov z21.d, z13.d\n"
+ "ld1b { z30.h }, p3/Z, [x28, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ "mov z18.d, z17.d\n"
".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
+ "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
- "ldp x24, x23, [x16, #0x10]\n"
".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
+ "ld1b { z26.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
- "ldp x22, x21, [x16, #0x20]\n"
- "ldp x20, x19, [x16, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x26, x7]\n"
- ".inst 0x45531bff // usublb z31.h, z31.b, z19.b\n"
- "ld1b { z30.h }, p3/Z, [x25, x7]\n"
- "ld1b { z29.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45531bde // usublb z30.h, z30.b, z19.b\n"
- "ld1b { z28.h }, p3/Z, [x23, x7]\n"
- "ld1b { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
- "ld1b { z25.h }, p3/Z, [x20, x7]\n"
- "ld1b { z24.h }, p3/Z, [x19, x7]\n"
- ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
- ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
- ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
- ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ "ld1b { z24.h }, p3/Z, [x20, x7]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
"1:" // Loop
".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- "ldr x23, [x16, #0x40]\n"
- "whilelt p0.h, x8, x5\n"
- ".inst 0x448847f0 // smlalt z16.s, p4/M, z31.h, z8.h\n"
- "ldr x22, [x16, #0x48]\n"
- "inch x6\n"
- ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
- "ldr x21, [x16, #0x50]\n"
- ".inst 0x448647e9 // smlalt z9.s, p4/M, z31.h, z6.h\n"
- "ldr x20, [x16, #0x58]\n"
- ".inst 0x448243f2 // smlalb z18.s, p4/M, z31.h, z2.h\n"
- "ldr x19, [x16, #0x60]\n"
- ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
- "ldr x10, [x16, #0x68]\n"
- ".inst 0x448043f6 // smlalb z22.s, p4/M, z31.h, z0.h\n"
- "ldr x9, [x16, #0x70]\n"
- ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
- "ldr x28, [x16, #0x78]\n"
+ ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
+ "ldr x25, [x11, #0x40]\n"
+ "ldr x24, [x11, #0x48]\n"
+ ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
+ "ldr x22, [x11, #0x50]\n"
+ "ldr x20, [x11, #0x58]\n"
".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- "ldr x27, [x16, #0x80]\n"
- ".inst 0x448047d0 // smlalt z16.s, p4/M, z30.h, z0.h\n"
- "ldr x26, [x16, #0x88]\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
- "ldr x25, [x16, #0x90]\n"
- ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
+ ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
+ "ldr x23, [x11, #0x78]\n"
+ "ldr x21, [x11, #0x60]\n"
+ ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- "ldr x24, [x16, #0x98]\n"
- ".inst 0x448147b0 // smlalt z16.s, p4/M, z29.h, z1.h\n"
- "ld1b { z29.h }, p3/Z, [x23, x7]\n"
- ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
- ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
- "ldr x23, [x16, #0xa0]\n"
- ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
+ ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- "ldr x22, [x16, #0xa8]\n"
- ".inst 0x44834750 // smlalt z16.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
"ld1b { z26.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
+ "ldr x22, [x11, #0x80]\n"
+ "ldr x20, [x11, #0x68]\n"
".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- "ldr x21, [x16, #0xb0]\n"
- ".inst 0x44844730 // smlalt z16.s, p4/M, z25.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x19, x7]\n"
- ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
+ ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
+ "ldr x21, [x11, #0x88]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- "ldr x20, [x16, #0xb8]\n"
- ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- "ldr x19, [x16, #0xc0]\n"
- ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n"
- "ld1w { z21.s }, p2/Z, [x17]\n"
- ".inst 0x44804709 // smlalt z9.s, p4/M, z24.h, z0.h\n"
- "ld1b { z24.h }, p3/Z, [x9, x7]\n"
- ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n"
- ".inst 0x448447a9 // smlalt z9.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x10, x7]\n"
- "addvl x17, x17, #2\n"
+ ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
+ "ldr x20, [x11, #0x70]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
+ "ldr x25, [x11, #0x98]\n"
+ "ld1b { z24.h }, p3/Z, [x20, x7]\n"
".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
- "uzp1 z30.s, z21.s, z17.s\n"
- "uzp2 z31.s, z21.s, z17.s\n"
- "ld1w { z21.s }, p2/Z, [x15]\n"
- ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
- "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n"
- "addvl x15, x15, #2\n"
- ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x7]\n"
- ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
- ".inst 0x44854770 // smlalt z16.s, p4/M, z27.h, z5.h\n"
- ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
- "ld1b { z27.h }, p3/Z, [x28, x7]\n"
- ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
- ".inst 0x44834352 // smlalb z18.s, p4/M, z26.h, z3.h\n"
- ".inst 0x4483474a // smlalt z10.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x26, x7]\n"
- ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
- ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44864730 // smlalt z16.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44804332 // smlalb z18.s, p4/M, z25.h, z0.h\n"
- ".inst 0x4480472a // smlalt z10.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x25, x7]\n"
- ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
- "uzp1 z0.s, z21.s, z17.s\n"
- "uzp2 z21.s, z21.s, z17.s\n"
- ".inst 0x448443b2 // smlalb z18.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
- ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- ".inst 0x44874710 // smlalt z16.s, p4/M, z24.h, z7.h\n"
- ".inst 0x44814312 // smlalb z18.s, p4/M, z24.h, z1.h\n"
- ".inst 0x4481470a // smlalt z10.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
- ".inst 0x04be75ad // sqrdmulh z13.s, z13.s, z30.s\n"
- ".inst 0x04bf7610 // sqrdmulh z16.s, z16.s, z31.s\n"
- ".inst 0x44844376 // smlalb z22.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844777 // smlalt z23.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ "ldr x24, [x11, #0x90]\n"
+ ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
"ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
- "and z4.d, z13.d, z0.d\n"
- "and z17.d, z16.d, z21.d\n"
- "asr z4.s, z4.s, #0x1f\n"
- ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n"
- ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x44814396 // smlalb z22.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44864332 // smlalb z18.s, p4/M, z25.h, z6.h\n"
- ".inst 0x4486472a // smlalt z10.s, p4/M, z25.h, z6.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x7]\n"
- ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
- "sqadd z13.s, z13.s, z4.s\n"
- "sqadd z16.s, z16.s, z17.s\n"
- ".inst 0x44854356 // smlalb z22.s, p4/M, z26.h, z5.h\n"
- ".inst 0x44854757 // smlalt z23.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
+ "ldr x23, [x11, #0xa8]\n"
+ "ldr x20, [x11, #0xa0]\n"
+ ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
"ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
- ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448243b6 // smlalb z22.s, p4/M, z29.h, z2.h\n"
- ".inst 0x448247b7 // smlalt z23.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
+ "ldr x22, [x11, #0xb0]\n"
+ "ldr x21, [x11, #0xb8]\n"
+ ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z31.s }, p2/Z, [x27]\n"
+ ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
+ "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z19.s, z31.s, z20.s\n"
+ ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
+ "uzp2 z30.s, z31.s, z20.s\n"
+ "ld1w { z31.s }, p2/Z, [x26]\n"
+ ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z26.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
+ "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z1.s, z31.s, z20.s\n"
+ ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
+ "uzp2 z31.s, z31.s, z20.s\n"
+ "inch x17\n"
+ ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
+ "and z0.d, z13.d, z1.d\n"
"inch x7\n"
- ".inst 0x04be756b // sqrdmulh z11.s, z11.s, z30.s\n"
- "whilelt p2.s, x7, x5\n"
- ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n"
- "mov x19, x7\n"
- ".inst 0x44874372 // smlalb z18.s, p4/M, z27.h, z7.h\n"
- ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
- ".inst 0x4487476a // smlalt z10.s, p4/M, z27.h, z7.h\n"
- "incw x19\n"
- ".inst 0x44834316 // smlalb z22.s, p4/M, z24.h, z3.h\n"
- "whilelt p1.s, x19, x5\n"
- "and z1.d, z11.d, z0.d\n"
- "whilelt p3.h, x7, x5\n"
- "and z17.d, z9.d, z21.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- ".inst 0x44854312 // smlalb z18.s, p4/M, z24.h, z5.h\n"
- ".inst 0x4485470a // smlalt z10.s, p4/M, z24.h, z5.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x44834717 // smlalt z23.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44874356 // smlalb z22.s, p4/M, z26.h, z7.h\n"
- ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n"
- ".inst 0x44884332 // smlalb z18.s, p4/M, z25.h, z8.h\n"
- "sqadd z11.s, z11.s, z1.s\n"
- "sqadd z9.s, z9.s, z17.s\n"
- "add z13.s, z13.s, z14.s\n"
- ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- ".inst 0x44874757 // smlalt z23.s, p4/M, z26.h, z7.h\n"
- ".inst 0x4488472a // smlalt z10.s, p4/M, z25.h, z8.h\n"
- ".inst 0x44864336 // smlalb z22.s, p4/M, z25.h, z6.h\n"
- "and z17.d, z18.d, z0.d\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x04bf754a // sqrdmulh z10.s, z10.s, z31.s\n"
- ".inst 0x44864737 // smlalt z23.s, p4/M, z25.h, z6.h\n"
- ".inst 0x448843b6 // smlalb z22.s, p4/M, z29.h, z8.h\n"
- "smin z13.s, p4/M, z13.s, z15.s\n"
- ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
- "and z1.d, z10.d, z21.d\n"
- "asr z1.s, z1.s, #0x1f\n"
- "add z16.s, z16.s, z14.s\n"
- "sqadd z18.s, z18.s, z17.s\n"
+ ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ "mov x20, x7\n"
+ ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "incw x20\n"
+ ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
+ "and z20.d, z17.d, z31.d\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "whilelt p3.h, x7, x8\n"
+ "sqadd z13.s, z13.s, z0.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
+ "addvl x27, x27, #2\n"
+ "and z19.d, z9.d, z1.d\n"
+ ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
+ "addvl x26, x26, #2\n"
+ "and z2.d, z11.d, z1.d\n"
".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smin z16.s, p4/M, z16.s, z15.s\n"
- "sqadd z10.s, z10.s, z1.s\n"
- "and z2.d, z22.d, z0.d\n"
+ "and z0.d, z21.d, z1.d\n"
+ ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
+ "sqadd z17.s, z17.s, z20.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
+ "and z3.d, z10.d, z31.d\n"
"asr z2.s, z2.s, #0x1f\n"
- ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n"
- "smax z16.s, p4/M, z16.s, z20.s\n"
- ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n"
- ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
- ".inst 0x44829012 // srshl z18.s, p4/M, z18.s, z0.s\n"
- "trn1 z13.h, z13.h, z16.h\n"
- "st1b { z13.h }, p0, [x14, x8]\n"
- "add z11.s, z11.s, z14.s\n"
- "add z9.s, z9.s, z14.s\n"
- "add z18.s, z18.s, z14.s\n"
- "sqadd z22.s, z22.s, z2.s\n"
- "and z16.d, z23.d, z21.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "smin z11.s, p4/M, z11.s, z15.s\n"
- "smin z9.s, p4/M, z9.s, z15.s\n"
- "smin z18.s, p4/M, z18.s, z15.s\n"
- ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n"
- ".inst 0x44829016 // srshl z22.s, p4/M, z22.s, z0.s\n"
- "smax z11.s, p4/M, z11.s, z20.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- "add z10.s, z10.s, z14.s\n"
- "add z22.s, z22.s, z14.s\n"
- "smax z9.s, p4/M, z9.s, z20.s\n"
- "smax z18.s, p4/M, z18.s, z20.s\n"
- "smin z10.s, p4/M, z10.s, z15.s\n"
- "smin z22.s, p4/M, z22.s, z15.s\n"
- "trn1 z11.h, z11.h, z9.h\n"
- "st1b { z11.h }, p0, [x13, x8]\n"
- "smax z10.s, p4/M, z10.s, z20.s\n"
- ".inst 0x448292b7 // srshl z23.s, p4/M, z23.s, z21.s\n"
- "smax z22.s, p4/M, z22.s, z20.s\n"
- "trn1 z18.h, z18.h, z10.h\n"
- "st1b { z18.h }, p0, [x12, x8]\n"
- "add z23.s, z23.s, z14.s\n"
- "smin z23.s, p4/M, z23.s, z15.s\n"
- "smax z23.s, p4/M, z23.s, z20.s\n"
- "trn1 z22.h, z22.h, z23.h\n"
- "st1b { z22.h }, p0, [x11, x8]\n"
- "inch x8\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z18.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z18.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z18.s, z16.s\n"
- "mov z11.d, z13.d\n"
- "ld1b { z0.h }, p4/Z, [x6]\n"
+ "and z26.d, z22.d, z31.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z20.d, z18.d, z31.d\n"
+ "sqadd z9.s, z9.s, z19.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z2.s\n"
+ ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z0.s\n"
+ ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
+ "sqadd z22.s, z22.s, z26.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
+ ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
+ ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
+ ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
+ ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
+ ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
+ ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
+ ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
+ "sqadd z13.h, z13.h, z14.h\n"
+ "sqadd z9.h, z9.h, z14.h\n"
+ "smax z13.h, p4/M, z13.h, z16.h\n"
+ "smax z9.h, p4/M, z9.h, z16.h\n"
+ "sqadd z11.h, z11.h, z14.h\n"
+ "sqadd z21.h, z21.h, z14.h\n"
+ "smax z11.h, p4/M, z11.h, z16.h\n"
+ "smax z21.h, p4/M, z21.h, z16.h\n"
+ "smin z13.h, p4/M, z13.h, z15.h\n"
+ "smin z9.h, p4/M, z9.h, z15.h\n"
+ "st1b { z13.h }, p0, [x16, x10]\n"
+ "smin z11.h, p4/M, z11.h, z15.h\n"
+ "smin z21.h, p4/M, z21.h, z15.h\n"
+ "st1b { z9.h }, p0, [x15, x10]\n"
+ "st1b { z11.h }, p0, [x14, x10]\n"
+ "st1b { z21.h }, p0, [x13, x10]\n"
+ "ld1b { z0.h }, p4/Z, [x17]\n"
+ "ld1b { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1b { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- "mov z9.d, z16.d\n"
- "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
- "mov z18.d, z13.d\n"
- "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
- "mov z10.d, z16.d\n"
- "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
- "mov z22.d, z13.d\n"
- "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
- "mov z23.d, z16.d\n"
- "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- "inch x6, ALL, MUL #8\n"
- "ld1b { z8.h }, p4/Z, [x6]\n"
- "ldp x26, x25, [x16, #0x0]\n"
+ "ld1w { z18.s }, p2/Z, [x12]\n"
+ "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z8.s\n"
+ "uzp2 z17.s, z18.s, z8.s\n"
+ "ld1b { z8.h }, p4/Z, [x17]\n"
+ "ldp x9, x28, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z9.d, z13.d\n"
+ "mov z10.d, z17.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x9, x7]\n"
+ "mov z11.d, z13.d\n"
+ "mov z22.d, z17.d\n"
+ "ld1b { z30.h }, p3/Z, [x28, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z17.d\n"
+ "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
- "ldp x24, x23, [x16, #0x10]\n"
+ "ld1b { z26.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
- "ldp x22, x21, [x16, #0x20]\n"
- "ldp x20, x19, [x16, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x26, x7]\n"
- ".inst 0x45531bff // usublb z31.h, z31.b, z19.b\n"
- "ld1b { z30.h }, p3/Z, [x25, x7]\n"
- "ld1b { z29.h }, p3/Z, [x24, x7]\n"
- ".inst 0x45531bde // usublb z30.h, z30.b, z19.b\n"
- "ld1b { z28.h }, p3/Z, [x23, x7]\n"
- "ld1b { z27.h }, p3/Z, [x22, x7]\n"
- ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
- "ld1b { z26.h }, p3/Z, [x21, x7]\n"
- ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
- "ld1b { z25.h }, p3/Z, [x20, x7]\n"
- "ld1b { z24.h }, p3/Z, [x19, x7]\n"
- ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
- ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
- ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
- ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
+ "ld1b { z24.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 0b2182f995..40e2f5df25 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,546 +111,538 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x0, #0x0\n"
+ "mov x24, x0\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
- "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x2, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x3, #0x0\n"
- "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x5, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z17.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z13.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z5.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x2, x0\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x2, x0\n"
- "ldp x7, x8, [x21, #0x0]\n"
- "mov x19, x2\n"
- "incw x19\n"
- "ldp x17, x16, [x21, #0x10]\n"
- "whilelt p1.s, x19, x0\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z19.s }, p2/Z, [x19]\n"
- "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z19.s, z6.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z19.s, z6.s\n"
- "mov z19.d, z11.d\n"
- "ld1b { z0.h }, p4/Z, [x1]\n"
- ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
- "mov z9.d, z16.d\n"
- "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
- "mov z7.d, z11.d\n"
- "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
- "mov z6.d, z16.d\n"
- "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
- "mov z12.d, z11.d\n"
- "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- "mov z8.d, z16.d\n"
- "ldp x28, x27, [x5, #0x0]\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- "ldp x26, x25, [x5, #0x10]\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- "ldp x24, x23, [x5, #0x20]\n"
- "ldp x22, x21, [x5, #0x30]\n"
- "ldp x20, x19, [x5, #0x40]\n"
- "ld1b { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
- "ld1b { z30.h }, p3/Z, [x27, x2]\n"
- "ld1b { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
- "ld1b { z28.h }, p3/Z, [x25, x2]\n"
- "ld1b { z27.h }, p3/Z, [x24, x2]\n"
- ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
- "ld1b { z23.h }, p3/Z, [x23, x2]\n"
- ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
- "ld1b { z25.h }, p3/Z, [x22, x2]\n"
- "ld1b { z24.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
- "ld1b { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
- "ld1b { z22.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
- ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
- ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
- ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z15.b }, p4/Z, [x21]\n"
+ "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z11.h }, p4/Z, [x20]\n"
+ "ldp x3, x4, [x22, #0x0]\n"
+ "whilelt p3.h, x0, x1\n"
+ "ldp x5, x6, [x22, #0x10]\n"
+ "whilelt p2.s, x0, x1\n"
+ "whilelt p1.s, x24, x1\n"
+ "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x7, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z30.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z14.s, z30.s, z16.s\n"
+ "ld1b { z0.h }, p4/Z, [x2]\n"
+ "ld1b { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
+ "uzp2 z10.s, z30.s, z16.s\n"
+ "addvl x14, x14, #2\n"
+ "ld1b { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
+ "mov x8, #0x0\n"
+ "mov z20.d, z14.d\n"
+ "ld1b { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
+ "ldp x9, x28, [x7, #0x0]\n"
+ "mov z7.d, z10.d\n"
+ "mov z8.d, z14.d\n"
+ "ldp x27, x26, [x7, #0x10]\n"
+ "ldp x25, x24, [x7, #0x20]\n"
+ "mov z16.d, z10.d\n"
+ "mov z6.d, z14.d\n"
+ "ldp x23, x22, [x7, #0x30]\n"
+ "ldp x21, x20, [x7, #0x40]\n"
+ "mov z5.d, z10.d\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ "ld1b { z31.h }, p3/Z, [x9, x0]\n"
+ "ld1b { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
+ "ld1b { z29.h }, p3/Z, [x27, x0]\n"
+ "ld1b { z28.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
+ "ld1b { z27.h }, p3/Z, [x25, x0]\n"
+ "ld1b { z23.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ "ld1b { z25.h }, p3/Z, [x23, x0]\n"
+ "ld1b { z24.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ "ld1b { z26.h }, p3/Z, [x21, x0]\n"
+ "ld1b { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x14, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
+ ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
"1:" // Loop
- ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x5, #0x50]\n"
- "whilelt p0.h, x3, x0\n"
- ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- "ldr x19, [x5, #0x58]\n"
- ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
- "ldr x25, [x5, #0x60]\n"
- ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
- ".inst 0x448043a7 // smlalb z7.s, p4/M, z29.h, z0.h\n"
- "ldr x24, [x5, #0x68]\n"
- ".inst 0x448047a6 // smlalt z6.s, p4/M, z29.h, z0.h\n"
- "ldr x23, [x5, #0x70]\n"
- ".inst 0x4480438c // smlalb z12.s, p4/M, z28.h, z0.h\n"
- "ldr x22, [x5, #0x78]\n"
- ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n"
- "ld1b { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
- ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
- "ldr x15, [x5, #0x80]\n"
- ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- "ld1b { z30.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
- ".inst 0x44814373 // smlalb z19.s, p4/M, z27.h, z1.h\n"
- "ldr x21, [x5, #0x88]\n"
- ".inst 0x44814769 // smlalt z9.s, p4/M, z27.h, z1.h\n"
- "ldr x20, [x5, #0x90]\n"
- ".inst 0x44814387 // smlalb z7.s, p4/M, z28.h, z1.h\n"
- "ldr x19, [x5, #0x98]\n"
- ".inst 0x44814786 // smlalt z6.s, p4/M, z28.h, z1.h\n"
- "ldr x14, [x5, #0xa0]\n"
- ".inst 0x448142ec // smlalb z12.s, p4/M, z23.h, z1.h\n"
- "ldr x13, [x5, #0xa8]\n"
- ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n"
- "ld1b { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
- ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
- "ldr x12, [x5, #0xb0]\n"
- ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x25, x2]\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
- ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
- "ldr x11, [x5, #0xb8]\n"
- ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
- "ldr x10, [x5, #0xc0]\n"
- ".inst 0x448242e7 // smlalb z7.s, p4/M, z23.h, z2.h\n"
- "ldr x9, [x5, #0xc8]\n"
- ".inst 0x448246e6 // smlalt z6.s, p4/M, z23.h, z2.h\n"
- "ldr x28, [x5, #0xd0]\n"
- ".inst 0x448243ec // smlalb z12.s, p4/M, z31.h, z2.h\n"
- "ldr x27, [x5, #0xd8]\n"
- ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
- "ld1b { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
- "ldr x26, [x5, #0xe0]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x2]\n"
- ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
- "ldr x25, [x5, #0xe8]\n"
- ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
- ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
- "ld1w { z18.s }, p2/Z, [x4]\n"
- ".inst 0x448343e7 // smlalb z7.s, p4/M, z31.h, z3.h\n"
- "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n"
- "addvl x4, x4, #2\n"
- ".inst 0x448347e6 // smlalt z6.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448343cc // smlalb z12.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n"
- "ld1b { z3.h }, p4/Z, [x1]\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- "uzp1 z21.s, z18.s, z20.s\n"
- "uzp2 z10.s, z18.s, z20.s\n"
- "ld1w { z18.s }, p2/Z, [x6]\n"
- ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
- "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n"
- "addvl x6, x6, #2\n"
- ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x2]\n"
- ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
- ".inst 0x44844373 // smlalb z19.s, p4/M, z27.h, z4.h\n"
- "ldr x24, [x5, #0xf0]\n"
- ".inst 0x44844769 // smlalt z9.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x2]\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
- ".inst 0x448443c7 // smlalb z7.s, p4/M, z30.h, z4.h\n"
- "ldr x23, [x5, #0xf8]\n"
- ".inst 0x448447c6 // smlalt z6.s, p4/M, z30.h, z4.h\n"
- ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
- "ld1b { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- ".inst 0x448043ab // smlalb z11.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
+ "ldr x20, [x7, #0x50]\n"
+ "ld1b { z31.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
+ "ldr x22, [x7, #0x58]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
+ "ldr x21, [x7, #0x60]\n"
+ "ldr x20, [x7, #0x68]\n"
+ ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z30.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
- "uzp1 z29.s, z18.s, z20.s\n"
- "uzp2 z20.s, z18.s, z20.s\n"
- ".inst 0x44804393 // smlalb z19.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n"
- ".inst 0x448042c7 // smlalb z7.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448046c6 // smlalt z6.s, p4/M, z22.h, z0.h\n"
- ".inst 0x4480432c // smlalb z12.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n"
- "ld1b { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
+ "ldr x25, [x7, #0x70]\n"
+ "ldr x24, [x7, #0x78]\n"
+ ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
+ "ld1b { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
+ "ldr x15, [x7, #0x80]\n"
+ "ldr x23, [x7, #0x88]\n"
+ ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
+ "ld1b { z27.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
- ".inst 0x448142f3 // smlalb z19.s, p4/M, z23.h, z1.h\n"
- "ldr x22, [x5, #0x100]\n"
- ".inst 0x448146e9 // smlalt z9.s, p4/M, z23.h, z1.h\n"
- ".inst 0x44814327 // smlalb z7.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814726 // smlalt z6.s, p4/M, z25.h, z1.h\n"
- ".inst 0x4481430c // smlalb z12.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n"
- "ld1b { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
- ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
- ".inst 0x448242eb // smlalb z11.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
+ "ldr x22, [x7, #0x90]\n"
+ "ldr x21, [x7, #0x98]\n"
+ ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
+ "ldr x14, [x7, #0xa0]\n"
+ "ldr x13, [x7, #0xa8]\n"
+ ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
- "ld1b { z23.h }, p3/Z, [x15, x2]\n"
- ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
- ".inst 0x448243f3 // smlalb z19.s, p4/M, z31.h, z2.h\n"
- "ldr x21, [x5, #0x108]\n"
- ".inst 0x448247e9 // smlalt z9.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44824307 // smlalb z7.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824706 // smlalt z6.s, p4/M, z24.h, z2.h\n"
- ".inst 0x4482436c // smlalb z12.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n"
- "ld1b { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- ".inst 0x448343eb // smlalb z11.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
+ "ldr x12, [x7, #0xb0]\n"
+ "ldr x20, [x7, #0xb8]\n"
+ ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
+ "ldr x11, [x7, #0xc0]\n"
+ ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z24.h }, p3/Z, [x25, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
- ".inst 0x448343d3 // smlalb z19.s, p4/M, z30.h, z3.h\n"
- "ldr x20, [x5, #0x110]\n"
- ".inst 0x448347c9 // smlalt z9.s, p4/M, z30.h, z3.h\n"
- ".inst 0x44834367 // smlalb z7.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834766 // smlalt z6.s, p4/M, z27.h, z3.h\n"
- ".inst 0x448342ec // smlalb z12.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
- "ld1b { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- ".inst 0x448443cb // smlalb z11.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
+ "ldr x10, [x7, #0xc8]\n"
+ "ldr x9, [x7, #0xd0]\n"
+ ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z3.h }, p4/Z, [x2]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
+ "ldr x28, [x7, #0xd8]\n"
+ "ldr x27, [x7, #0xe0]\n"
+ ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
- ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
- "ldr x19, [x5, #0x118]\n"
- ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
- "ld1b { z26.h }, p3/Z, [x14, x2]\n"
- ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
- ".inst 0x448442e7 // smlalb z7.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448446e6 // smlalt z6.s, p4/M, z23.h, z4.h\n"
- ".inst 0x4484438c // smlalb z12.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n"
- "ld1b { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- ".inst 0x448042cb // smlalb z11.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
+ "ldr x26, [x7, #0xe8]\n"
+ "ldr x25, [x7, #0xf0]\n"
+ ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
+ ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
+ ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
+ "ld1w { z19.s }, p2/Z, [x17]\n"
+ "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
+ ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z28.h }, p3/Z, [x23, x0]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
- "ld1b { z22.h }, p3/Z, [x11, x2]\n"
- ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
- ".inst 0x44804333 // smlalb z19.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804729 // smlalt z9.s, p4/M, z25.h, z0.h\n"
- ".inst 0x448043e7 // smlalb z7.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e6 // smlalt z6.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448043cc // smlalb z12.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n"
- "ld1b { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4481432b // smlalb z11.s, p4/M, z25.h, z1.h\n"
- ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
+ "ldr x24, [x7, #0xf8]\n"
+ "uzp1 z9.s, z19.s, z18.s\n"
+ ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
+ "ld1b { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
+ "uzp2 z29.s, z19.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16]\n"
+ ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
+ "ld1b { z23.h }, p3/Z, [x15, x0]\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
- "ld1b { z25.h }, p3/Z, [x13, x2]\n"
- ".inst 0x44814313 // smlalb z19.s, p4/M, z24.h, z1.h\n"
- ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
- ".inst 0x44814709 // smlalt z9.s, p4/M, z24.h, z1.h\n"
- ".inst 0x448143c7 // smlalb z7.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448147c6 // smlalt z6.s, p4/M, z30.h, z1.h\n"
- ".inst 0x4481434c // smlalb z12.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n"
- "ld1b { z1.h }, p4/Z, [x1]\n"
- ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
- ".inst 0x4482430b // smlalb z11.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
+ "ldr x23, [x7, #0x100]\n"
+ "whilelt p0.h, x8, x1\n"
+ ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
+ "ld1b { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
+ "addvl x17, x17, #2\n"
+ ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
+ "ld1b { z31.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
- "ld1b { z24.h }, p3/Z, [x12, x2]\n"
- ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
- ".inst 0x44824373 // smlalb z19.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824347 // smlalb z7.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824746 // smlalt z6.s, p4/M, z26.h, z2.h\n"
- ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
- "ld1b { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
+ "ldr x22, [x7, #0x108]\n"
+ ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
+ "ld1b { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
+ ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
+ "ld1b { z30.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
- "ld1b { z27.h }, p3/Z, [x10, x2]\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
- ".inst 0x448342f3 // smlalb z19.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448346e9 // smlalt z9.s, p4/M, z23.h, z3.h\n"
- ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
- ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
- "ld1b { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- ".inst 0x448442eb // smlalb z11.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
+ "ldr x21, [x7, #0x110]\n"
+ ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z26.h }, p3/Z, [x14, x0]\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
- "ld1b { z23.h }, p3/Z, [x9, x2]\n"
- ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
- ".inst 0x44844393 // smlalb z19.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n"
- "ld1b { z28.h }, p3/Z, [x26, x2]\n"
- ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
- ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
- ".inst 0x448442cc // smlalb z12.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n"
- "ld1b { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
+ ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
+ "ldr x20, [x7, #0x118]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x13, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
- ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
- ".inst 0x44804367 // smlalb z7.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44804766 // smlalt z6.s, p4/M, z27.h, z0.h\n"
- ".inst 0x448042ec // smlalb z12.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n"
- "ld1b { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
- ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x12, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
- "ld1b { z30.h }, p3/Z, [x27, x2]\n"
- ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
- ".inst 0x44814353 // smlalb z19.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44814749 // smlalt z9.s, p4/M, z26.h, z1.h\n"
- ".inst 0x448142e7 // smlalb z7.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448146e6 // smlalt z6.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143ec // smlalb z12.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n"
- "ld1b { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
- ".inst 0x4482434b // smlalb z11.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z1.h }, p4/Z, [x2]\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x11, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
- "ld1b { z26.h }, p3/Z, [x25, x2]\n"
- ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
- ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
- ".inst 0x448243e7 // smlalb z7.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247e6 // smlalt z6.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243cc // smlalb z12.s, p4/M, z30.h, z2.h\n"
- ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n"
- "ld1b { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
+ "ld1b { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
+ ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
+ ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
+ "ld1b { z23.h }, p3/Z, [x10, x0]\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x2]\n"
- ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
- ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448343c7 // smlalb z7.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448347c6 // smlalt z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x4483438c // smlalb z12.s, p4/M, z28.h, z3.h\n"
- ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n"
- "ld1b { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x0]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x2]\n"
- ".inst 0x448442d3 // smlalb z19.s, p4/M, z22.h, z4.h\n"
- ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
- ".inst 0x448446c9 // smlalt z9.s, p4/M, z22.h, z4.h\n"
- ".inst 0x44844387 // smlalb z7.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844786 // smlalt z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
- "ld1b { z4.h }, p4/Z, [x1]\n"
- "inch x1\n"
- ".inst 0x4480436b // smlalb z11.s, p4/M, z27.h, z0.h\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x27, x0]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
+ ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
+ ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x2]\n"
- ".inst 0x448042f3 // smlalb z19.s, p4/M, z23.h, z0.h\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
- ".inst 0x448046e9 // smlalt z9.s, p4/M, z23.h, z0.h\n"
- ".inst 0x44804327 // smlalb z7.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804726 // smlalt z6.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
- ".inst 0x4480430c // smlalb z12.s, p4/M, z24.h, z0.h\n"
- ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448142eb // smlalb z11.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
+ "ld1b { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
+ "ld1b { z26.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143f3 // smlalb z19.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814307 // smlalb z7.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814706 // smlalt z6.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x20, x2]\n"
- ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
- ".inst 0x4481436c // smlalb z12.s, p4/M, z27.h, z1.h\n"
- ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z25.h }, p3/Z, [x25, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243d3 // smlalb z19.s, p4/M, z30.h, z2.h\n"
- ".inst 0x448247c9 // smlalt z9.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824367 // smlalb z7.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824766 // smlalt z6.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x19, x2]\n"
- "inch x2\n"
- ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
- "whilelt p2.s, x2, x0\n"
- ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
- "mov x19, x2\n"
- ".inst 0x448343cb // smlalb z11.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
+ ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z24.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
- "incw x19\n"
- ".inst 0x44834393 // smlalb z19.s, p4/M, z28.h, z3.h\n"
- "whilelt p1.s, x19, x0\n"
- ".inst 0x44834789 // smlalt z9.s, p4/M, z28.h, z3.h\n"
- "whilelt p3.h, x2, x0\n"
- ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
- ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
- ".inst 0x4484438b // smlalb z11.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
+ "ld1b { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04b5756b // sqrdmulh z11.s, z11.s, z21.s\n"
- ".inst 0x04aa7610 // sqrdmulh z16.s, z16.s, z10.s\n"
- ".inst 0x04b57673 // sqrdmulh z19.s, z19.s, z21.s\n"
- ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
- "and z31.d, z11.d, z29.d\n"
- "asr z31.s, z31.s, #0x1f\n"
- "and z23.d, z16.d, z20.d\n"
- "and z25.d, z19.d, z29.d\n"
- "asr z23.s, z23.s, #0x1f\n"
- "and z18.d, z9.d, z20.d\n"
- ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
- "asr z25.s, z25.s, #0x1f\n"
- ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
+ "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
+ "addvl x16, x16, #2\n"
+ ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
+ "ld1b { z4.h }, p4/Z, [x2]\n"
+ ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
+ ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
+ "inch x2\n"
+ ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
+ "uzp1 z23.s, z19.s, z18.s\n"
+ ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
+ "uzp2 z22.s, z19.s, z18.s\n"
+ ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
+ "inch x0\n"
+ ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
+ "and z21.d, z14.d, z23.d\n"
+ "mov x20, x0\n"
+ ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "incw x20\n"
+ ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
+ "whilelt p2.s, x0, x1\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
+ "and z3.d, z10.d, z22.d\n"
+ "whilelt p1.s, x20, x1\n"
+ ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
+ "whilelt p3.h, x0, x1\n"
+ ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sqadd z14.s, z14.s, z21.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
+ "and z19.d, z20.d, z23.d\n"
+ ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
+ "and z18.d, z8.d, z23.d\n"
+ ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
+ "and z21.d, z6.d, z23.d\n"
+ ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
+ "and z1.d, z7.d, z22.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z11.s, z11.s, z31.s\n"
- ".inst 0x4484436c // smlalb z12.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04b574e7 // sqrdmulh z7.s, z7.s, z21.s\n"
- "sqadd z16.s, z16.s, z23.s\n"
- "sqadd z19.s, z19.s, z25.s\n"
- ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n"
- "sqadd z9.s, z9.s, z18.s\n"
- "and z1.d, z7.d, z29.d\n"
+ "and z2.d, z16.d, z22.d\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z3.d, z5.d, z22.d\n"
+ "sqadd z20.s, z20.s, z19.s\n"
+ ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
"asr z1.s, z1.s, #0x1f\n"
- "and z18.d, z6.d, z20.d\n"
- ".inst 0x04b5758c // sqrdmulh z12.s, z12.s, z21.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n"
- ".inst 0x448293ab // srshl z11.s, p4/M, z11.s, z29.s\n"
- "and z30.d, z12.d, z29.d\n"
- "asr z30.s, z30.s, #0x1f\n"
- "add z11.s, z11.s, z14.s\n"
- "sqadd z7.s, z7.s, z1.s\n"
- "sqadd z6.s, z6.s, z18.s\n"
- ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
- "smin z11.s, p4/M, z11.s, z15.s\n"
- ".inst 0x44829290 // srshl z16.s, p4/M, z16.s, z20.s\n"
- "sqadd z12.s, z12.s, z30.s\n"
- "and z3.d, z8.d, z20.d\n"
+ "sqadd z8.s, z8.s, z18.s\n"
+ ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z21.s\n"
+ ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
"asr z3.s, z3.s, #0x1f\n"
- "add z16.s, z16.s, z14.s\n"
- "smax z11.s, p4/M, z11.s, z5.s\n"
- ".inst 0x448293b3 // srshl z19.s, p4/M, z19.s, z29.s\n"
- ".inst 0x44829289 // srshl z9.s, p4/M, z9.s, z20.s\n"
- "smin z16.s, p4/M, z16.s, z15.s\n"
- ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
- "add z19.s, z19.s, z14.s\n"
- "add z9.s, z9.s, z14.s\n"
- "sqadd z8.s, z8.s, z3.s\n"
- "add z7.s, z7.s, z14.s\n"
- "smax z16.s, p4/M, z16.s, z5.s\n"
- "smin z19.s, p4/M, z19.s, z15.s\n"
- "smin z9.s, p4/M, z9.s, z15.s\n"
- "smin z7.s, p4/M, z7.s, z15.s\n"
- "trn1 z11.h, z11.h, z16.h\n"
- "st1b { z11.h }, p0, [x7, x3]\n"
- "smax z19.s, p4/M, z19.s, z5.s\n"
- "smax z9.s, p4/M, z9.s, z5.s\n"
- "smax z7.s, p4/M, z7.s, z5.s\n"
- ".inst 0x44829286 // srshl z6.s, p4/M, z6.s, z20.s\n"
- ".inst 0x448293ac // srshl z12.s, p4/M, z12.s, z29.s\n"
- "trn1 z19.h, z19.h, z9.h\n"
- "st1b { z19.h }, p0, [x8, x3]\n"
- "add z6.s, z6.s, z14.s\n"
- ".inst 0x44829288 // srshl z8.s, p4/M, z8.s, z20.s\n"
- "add z12.s, z12.s, z14.s\n"
- "smin z6.s, p4/M, z6.s, z15.s\n"
- "add z8.s, z8.s, z14.s\n"
- "smin z12.s, p4/M, z12.s, z15.s\n"
- "smax z6.s, p4/M, z6.s, z5.s\n"
- "smin z8.s, p4/M, z8.s, z15.s\n"
- "smax z12.s, p4/M, z12.s, z5.s\n"
- "trn1 z7.h, z7.h, z6.h\n"
- "st1b { z7.h }, p0, [x17, x3]\n"
- "smax z8.s, p4/M, z8.s, z5.s\n"
- "trn1 z12.h, z12.h, z8.h\n"
- "st1b { z12.h }, p0, [x16, x3]\n"
- "inch x3\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z19.s }, p2/Z, [x19]\n"
- "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z11.s, z19.s, z6.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z16.s, z19.s, z6.s\n"
- "mov z19.d, z11.d\n"
- "ld1b { z0.h }, p4/Z, [x1]\n"
- ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
- "mov z9.d, z16.d\n"
- "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
- "mov z7.d, z11.d\n"
- "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
- "mov z6.d, z16.d\n"
- "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
- "mov z12.d, z11.d\n"
- "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- "mov z8.d, z16.d\n"
- "ldp x28, x27, [x5, #0x0]\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- "ldp x26, x25, [x5, #0x10]\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- "ldp x24, x23, [x5, #0x20]\n"
- "ldp x22, x21, [x5, #0x30]\n"
- "ldp x20, x19, [x5, #0x40]\n"
- "ld1b { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
- "ld1b { z30.h }, p3/Z, [x27, x2]\n"
- "ld1b { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
- "ld1b { z28.h }, p3/Z, [x25, x2]\n"
- "ld1b { z27.h }, p3/Z, [x24, x2]\n"
- ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
- "ld1b { z23.h }, p3/Z, [x23, x2]\n"
- ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
- "ld1b { z25.h }, p3/Z, [x22, x2]\n"
- "ld1b { z24.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
- "ld1b { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
- "ld1b { z22.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
- ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
- ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
- ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ "sqadd z7.s, z7.s, z1.s\n"
+ ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z5.s, z5.s, z3.s\n"
+ ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
+ ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
+ ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
+ ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
+ ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
+ "sqadd z14.h, z14.h, z12.h\n"
+ "sqadd z20.h, z20.h, z12.h\n"
+ "smax z14.h, p4/M, z14.h, z13.h\n"
+ "smax z20.h, p4/M, z20.h, z13.h\n"
+ "sqadd z8.h, z8.h, z12.h\n"
+ "sqadd z6.h, z6.h, z12.h\n"
+ "smax z8.h, p4/M, z8.h, z13.h\n"
+ "smax z6.h, p4/M, z6.h, z13.h\n"
+ "smin z14.h, p4/M, z14.h, z11.h\n"
+ "smin z20.h, p4/M, z20.h, z11.h\n"
+ "st1b { z14.h }, p0, [x3, x8]\n"
+ "smin z8.h, p4/M, z8.h, z11.h\n"
+ "smin z6.h, p4/M, z6.h, z11.h\n"
+ "st1b { z20.h }, p0, [x4, x8]\n"
+ "st1b { z8.h }, p0, [x5, x8]\n"
+ "st1b { z6.h }, p0, [x6, x8]\n"
+ "ld1w { z30.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z14.s, z30.s, z16.s\n"
+ "ld1b { z0.h }, p4/Z, [x2]\n"
+ "ld1b { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
+ "uzp2 z10.s, z30.s, z16.s\n"
+ "addvl x14, x14, #2\n"
+ "ld1b { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
+ "inch x8\n"
+ "str x14, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
+ "ldp x9, x28, [x7, #0x0]\n"
+ "mov z20.d, z14.d\n"
+ "mov z7.d, z10.d\n"
+ "ldp x27, x26, [x7, #0x10]\n"
+ "ldp x25, x24, [x7, #0x20]\n"
+ "mov z8.d, z14.d\n"
+ "mov z16.d, z10.d\n"
+ "ldp x23, x22, [x7, #0x30]\n"
+ "ldp x21, x20, [x7, #0x40]\n"
+ "mov z6.d, z14.d\n"
+ "mov z5.d, z10.d\n"
+ "ld1b { z31.h }, p3/Z, [x9, x0]\n"
+ "ld1b { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ "ld1b { z29.h }, p3/Z, [x27, x0]\n"
+ "ld1b { z28.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x45511842 // usublb z2.h, z2.b, z17.b\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ "ld1b { z27.h }, p3/Z, [x25, x0]\n"
+ "ld1b { z23.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x45511884 // usublb z4.h, z4.b, z17.b\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ "ld1b { z25.h }, p3/Z, [x23, x0]\n"
+ "ld1b { z24.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
+ "ld1b { z26.h }, p3/Z, [x21, x0]\n"
+ "ld1b { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
+ ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 1c8b8f9d19..66c24c34b5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,309 +41,295 @@ void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "mov z31.s, #0x0\n"
- "ldr x24, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "mov z18.s, #0x0\n"
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
"ldr x23, [%x[inptrs], #0x8]\n"
- "lsl x9, %x[n_channels], #0x2\n"
- "mov z29.s, #0x0\n"
- "ldr x22, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "mov z28.s, #0x0\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
- "mov x19, #0x9\n"
- "mov z13.s, #0x0\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "whilelt p1.b, XZR, x19\n"
- "mov z14.s, #0x0\n"
- "ld1b { z7.b }, p1/Z, [x24]\n"
- "mov x19, #0x3\n"
- "mov z15.s, #0x0\n"
- "ld1b { z3.b }, p1/Z, [x23]\n"
- "whilelt p0.b, XZR, x19\n"
- "mov z11.b, p0/z, #0x1\n"
- "ld1b { z4.b }, p1/Z, [x22]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z15.b, #0x1\n"
+ "lsr z15.s, z15.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z30.d, z1.d\n"
+ "mov z29.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z28.d, z1.d\n"
+ "mov z27.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z26.d, z2.d\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z24.d, z4.d\n"
+ "mov z23.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z22.d, z4.d\n"
+ "ext z30.b, z30.b, z30.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z14.s, p2/M, z14.s\n"
+ "ext z29.b, z29.b, z29.b, #0x4\n"
+ "ext z28.b, z28.b, z28.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p1.b, x9, x10\n"
+ "ext z27.b, z27.b, z27.b, #0x2\n"
+ "ext z26.b, z26.b, z26.b, #0x4\n"
+ "ld1w { z13.s }, p1/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "mov z10.d, z7.d\n"
- "ld1b { z6.b }, p1/Z, [x21]\n"
- "mov x27, #0x0\n"
- "ext z10.b, z10.b, z10.b, #0x2\n"
- "ld1b { z5.b }, p1/Z, [x20]\n"
- "whilelt p1.b, x28, x9\n"
- "mov z17.d, z7.d\n"
- "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z26.d, z7.d\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "ext z26.b, z26.b, z26.b, #0x6\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
- "mov z19.d, z3.d\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "ext z19.b, z19.b, z19.b, #0x2\n"
+ "ext z25.b, z25.b, z25.b, #0x6\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z23.b, z23.b, z23.b, #0x4\n"
+ "ext z22.b, z22.b, z22.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
"ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z7.s, z7.s, z17.s\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "zip1 z10.s, z10.s, z26.s\n"
- "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "zip1 z7.s, z7.s, z10.s\n"
- "ld1w { z1.s }, p1/Z, [%x[params]]\n"
- "mov z7.q, z7.q[0]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z18.d, z3.d\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p1/Z, [%x[params], #1, MUL VL]\n"
"mov z17.d, z3.d\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
- "mov z2.d, z3.d\n"
- "mov z20.d, z4.d\n"
- "ext z2.b, z2.b, z2.b, #0x6\n"
- "zip1 z3.s, z3.s, z17.s\n"
- "ext z20.b, z20.b, z20.b, #0x2\n"
- "mov z17.d, z4.d\n"
- "zip1 z19.s, z19.s, z2.s\n"
- "zip1 z3.s, z3.s, z19.s\n"
- "mov z3.q, z3.q[0]\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z29.s\n"
+ "zip1 z30.s, z30.s, z28.s\n"
+ "zip1 z2.s, z2.s, z26.s\n"
+ "zip1 z27.s, z27.s, z25.s\n"
+ "ext z18.b, z18.b, z18.b, #0x2\n"
"ext z17.b, z17.b, z17.b, #0x4\n"
- "mov z26.d, z4.d\n"
- "ext z26.b, z26.b, z26.b, #0x6\n"
- "mov z21.d, z6.d\n"
- "zip1 z4.s, z4.s, z17.s\n"
- "ext z21.b, z21.b, z21.b, #0x2\n"
- "zip1 z20.s, z20.s, z26.s\n"
- "zip1 z4.s, z4.s, z20.s\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z23.s\n"
+ "zip1 z24.s, z24.s, z22.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z30.s\n"
+ "zip1 z2.s, z2.s, z27.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z18.s, z18.s, z16.s\n"
+ "zip1 z4.s, z4.s, z24.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z18.s\n"
"mov z4.q, z4.q[0]\n"
- "mov z17.d, z6.d\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "mov z20.d, z6.d\n"
- "ext z20.b, z20.b, z20.b, #0x6\n"
- "mov z19.d, z5.d\n"
- "zip1 z6.s, z6.s, z17.s\n"
- "ext z19.b, z19.b, z19.b, #0x2\n"
- "zip1 z21.s, z21.s, z20.s\n"
- "zip1 z6.s, z6.s, z21.s\n"
- "mov z6.q, z6.q[0]\n"
- "mov z17.d, z5.d\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "mov z20.d, z5.d\n"
- "ext z20.b, z20.b, z20.b, #0x6\n"
- "mov z11.s, z11.s[0]\n"
- "zip1 z5.s, z5.s, z17.s\n"
- "mov z25.s, #0x0\n"
- "zip1 z19.s, z19.s, z20.s\n"
- "zip1 z5.s, z5.s, z19.s\n"
- "mov z5.q, z5.q[0]\n"
- "mov z26.s, #0x0\n"
- "mov z27.s, #0x0\n"
"mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "udot z24.s, z15.b, z1.b[0]\n"
"mov z23.s, #0x0\n"
"mov z22.s, #0x0\n"
+ "udot z25.s, z15.b, z1.b[1]\n"
"mov z21.s, #0x0\n"
- "mov z17.s, #0x0\n"
"mov z20.s, #0x0\n"
- "mov z2.s, #0x0\n"
+ "udot z23.s, z15.b, z1.b[2]\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "udot z22.s, z15.b, z1.b[3]\n"
"mov z19.s, #0x0\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "udot z18.s, z11.b, z7.b[1]\n"
- "udot z29.s, z11.b, z7.b[2]\n"
- "udot z28.s, z11.b, z7.b[3]\n"
- "udot z13.s, z11.b, z3.b[0]\n"
- "udot z14.s, z11.b, z3.b[1]\n"
- "udot z15.s, z11.b, z3.b[2]\n"
- "udot z25.s, z11.b, z3.b[3]\n"
- "udot z26.s, z11.b, z4.b[0]\n"
- "udot z27.s, z11.b, z4.b[1]\n"
- "udot z24.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z4.b[3]\n"
- "udot z22.s, z11.b, z6.b[0]\n"
- "udot z21.s, z11.b, z6.b[1]\n"
- "udot z17.s, z11.b, z6.b[2]\n"
- "udot z20.s, z11.b, z6.b[3]\n"
- "udot z2.s, z11.b, z5.b[0]\n"
- "udot z19.s, z11.b, z5.b[1]\n"
- "mov z31.d, z31.d\n"
- "mov z18.d, z18.d\n"
- "mov z29.d, z29.d\n"
- "mov z28.d, z28.d\n"
- "add z31.s, z31.s, z13.s\n"
- "mov z13.s, #0x0\n"
- "udot z13.s, z11.b, z5.b[2]\n"
- "add z18.s, z18.s, z14.s\n"
- "mov z14.s, #0x0\n"
- "udot z14.s, z11.b, z5.b[3]\n"
- "add z29.s, z29.s, z15.s\n"
- "add z28.s, z28.s, z25.s\n"
- "add z31.s, z31.s, z26.s\n"
- "add z18.s, z18.s, z27.s\n"
- "add z29.s, z29.s, z24.s\n"
- "add z28.s, z28.s, z23.s\n"
- "mov z26.d, z26.d\n"
- "mov z25.d, z27.d\n"
- "mov z24.d, z24.d\n"
- "mov z23.d, z23.d\n"
- "add z26.s, z26.s, z22.s\n"
- "add z25.s, z25.s, z21.s\n"
- "add z24.s, z24.s, z17.s\n"
- "add z23.s, z23.s, z20.s\n"
- "add z26.s, z26.s, z2.s\n"
- "add z25.s, z25.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "udot z21.s, z15.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z20.s, z15.b, z2.b[1]\n"
+ "udot z9.s, z15.b, z2.b[2]\n"
+ "udot z8.s, z15.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z15.b, z4.b[0]\n"
+ "udot z18.s, z15.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "udot z17.s, z15.b, z4.b[2]\n"
+ "udot z16.s, z15.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "udot z31.s, z15.b, z0.b[0]\n"
+ "mov z28.s, #0x0\n"
+ "udot z30.s, z15.b, z0.b[1]\n"
+ "udot z29.s, z15.b, z0.b[2]\n"
+ "udot z28.s, z15.b, z0.b[3]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z20.s\n"
+ "add z26.s, z23.s, z9.s\n"
+ "add z27.s, z22.s, z8.s\n"
+ "add z23.s, z19.s, z21.s\n"
+ "mov z22.s, #0x0\n"
+ "udot z22.s, z15.b, z3.b[0]\n"
+ "add z21.s, z18.s, z20.s\n"
+ "mov z20.s, #0x0\n"
+ "udot z20.s, z15.b, z3.b[1]\n"
+ "add z19.s, z17.s, z9.s\n"
+ "mov z18.s, #0x0\n"
+ "udot z18.s, z15.b, z3.b[2]\n"
+ "add z17.s, z16.s, z8.s\n"
+ "mov z16.s, #0x0\n"
+ "udot z16.s, z15.b, z3.b[3]\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z14.s\n"
+ "mul z25.s, p2/M, z25.s, z14.s\n"
+ "add z26.s, z26.s, z29.s\n"
+ "add z27.s, z27.s, z28.s\n"
+ "mul z26.s, p2/M, z26.s, z14.s\n"
+ "mul z27.s, p2/M, z27.s, z14.s\n"
+ "add z28.s, z23.s, z22.s\n"
+ "add z29.s, z21.s, z20.s\n"
+ "mul z28.s, p2/M, z28.s, z14.s\n"
+ "mul z29.s, p2/M, z29.s, z14.s\n"
+ "add z30.s, z19.s, z18.s\n"
+ "add z31.s, z17.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z14.s\n"
+ "mul z31.s, p2/M, z31.s, z14.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
"add z24.s, z24.s, z13.s\n"
- "add z23.s, z23.s, z14.s\n"
- "neg z30.s, p2/M, z30.s\n"
- "mul z31.s, p2/M, z31.s, z30.s\n"
- "st1w { z31.s }, p2, [SP]\n"
- "add z31.s, z31.s, z1.s\n"
- "mul z18.s, p2/M, z18.s, z30.s\n"
- "st1w { z18.s }, p2, [SP, #1, MUL VL]\n"
- "add z18.s, z18.s, z1.s\n"
- "mul z29.s, p2/M, z29.s, z30.s\n"
- "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
- "add z29.s, z29.s, z1.s\n"
- "mul z28.s, p2/M, z28.s, z30.s\n"
- "st1w { z28.s }, p2, [SP, #3, MUL VL]\n"
- "add z28.s, z28.s, z1.s\n"
- "mul z26.s, p2/M, z26.s, z30.s\n"
- "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
- "add z26.s, z26.s, z1.s\n"
- "mul z25.s, p2/M, z25.s, z30.s\n"
- "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z1.s\n"
- "mul z24.s, p2/M, z24.s, z30.s\n"
- "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
- "add z24.s, z24.s, z1.s\n"
- "mul z23.s, p2/M, z23.s, z30.s\n"
- "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
- "add z23.s, z23.s, z1.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"1:" // Loop
- "udot z31.s, z8.b, z7.b[0]\n"
- "ld1w { z22.s }, p2/Z, [%x[params]]\n"
- "incb x28\n"
- "udot z18.s, z8.b, z7.b[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "whilelt p0.s, x27, %x[n_channels]\n"
- "udot z29.s, z8.b, z7.b[2]\n"
- "whilelt p1.b, x28, x9\n"
- "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n"
- "udot z28.s, z8.b, z7.b[3]\n"
- "udot z26.s, z8.b, z4.b[0]\n"
- "udot z25.s, z8.b, z4.b[1]\n"
- "udot z24.s, z8.b, z4.b[2]\n"
- "udot z23.s, z8.b, z4.b[3]\n"
- "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "udot z31.s, z9.b, z3.b[0]\n"
- "udot z18.s, z9.b, z3.b[1]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "udot z28.s, z9.b, z3.b[3]\n"
- "udot z26.s, z9.b, z6.b[0]\n"
- "udot z25.s, z9.b, z6.b[1]\n"
- "udot z24.s, z9.b, z6.b[2]\n"
- "udot z23.s, z9.b, z6.b[3]\n"
- "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "udot z31.s, z10.b, z4.b[0]\n"
- "udot z18.s, z10.b, z4.b[1]\n"
- "udot z29.s, z10.b, z4.b[2]\n"
- "udot z28.s, z10.b, z4.b[3]\n"
- "udot z26.s, z10.b, z5.b[0]\n"
- "udot z25.s, z10.b, z5.b[1]\n"
- "udot z24.s, z10.b, z5.b[2]\n"
- "udot z23.s, z10.b, z5.b[3]\n"
- "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "udot z24.s, z5.b, z0.b[0]\n"
+ "udot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z26.s, z5.b, z0.b[2]\n"
+ "udot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z24.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p1.b, x9, x10\n"
+ "ld1w { z13.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "udot z26.s, z6.b, z1.b[2]\n"
+ "udot z27.s, z6.b, z1.b[3]\n"
+ "udot z28.s, z5.b, z2.b[0]\n"
+ "udot z29.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z2.b[2]\n"
+ "udot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04b57718 // sqrdmulh z24.s, z24.s, z21.s\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04b5775a // sqrdmulh z26.s, z26.s, z21.s\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04b5777b // sqrdmulh z27.s, z27.s, z21.s\n"
+ "ld1b { z6.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "udot z28.s, z7.b, z4.b[0]\n"
+ "udot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z20.d\n"
+ "udot z30.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z20.d\n"
+ "ld1b { z7.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z20.d\n"
+ "and z16.d, z27.d, z20.d\n"
"addvl %x[params], %x[params], #6\n"
- ".inst 0x04b677ff // sqrdmulh z31.s, z31.s, z22.s\n"
- ".inst 0x04b67652 // sqrdmulh z18.s, z18.s, z22.s\n"
- ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
- ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
- ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
- "and z20.d, z31.d, z21.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z18.d, z21.d\n"
- "and z14.d, z29.d, z21.d\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z17.d, z28.d, z21.d\n"
- "and z2.d, z26.d, z21.d\n"
- "asr z14.s, z14.s, #0x1f\n"
- ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z31.s, z31.s, z20.s\n"
- ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
- "sqadd z18.s, z18.s, z19.s\n"
- "sqadd z29.s, z29.s, z14.s\n"
- "and z27.d, z25.d, z21.d\n"
- "asr z27.s, z27.s, #0x1f\n"
- "sqadd z28.s, z28.s, z17.s\n"
- "sqadd z26.s, z26.s, z2.s\n"
- "and z17.d, z24.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b5779c // sqrdmulh z28.s, z28.s, z21.s\n"
+ ".inst 0x04b577bd // sqrdmulh z29.s, z29.s, z21.s\n"
+ ".inst 0x04b577de // sqrdmulh z30.s, z30.s, z21.s\n"
+ ".inst 0x04b577ff // sqrdmulh z31.s, z31.s, z21.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
+ ".inst 0x44828a9b // srshl z27.s, p2/M, z27.s, z20.s\n"
+ "and z19.d, z28.d, z20.d\n"
+ "and z18.d, z29.d, z20.d\n"
+ "and z17.d, z30.d, z20.d\n"
+ "and z16.d, z31.d, z20.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
- "and z15.d, z23.d, z21.d\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "asr z15.s, z15.s, #0x1f\n"
- "sqadd z25.s, z25.s, z27.s\n"
- ".inst 0x44828ab2 // srshl z18.s, p2/M, z18.s, z21.s\n"
- "add z31.s, z31.s, z12.s\n"
- "sqadd z24.s, z24.s, z17.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
- "add z18.s, z18.s, z12.s\n"
- "sqadd z23.s, z23.s, z15.s\n"
- "smin z31.s, p2/M, z31.s, z0.s\n"
- "add z29.s, z29.s, z12.s\n"
- "smin z18.s, p2/M, z18.s, z0.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- "smax z31.s, p2/M, z31.s, z16.s\n"
- "st1b { z31.s }, p0, [x26, x27]\n"
- "add z28.s, z28.s, z12.s\n"
- "smax z18.s, p2/M, z18.s, z16.s\n"
- "ld1w { z31.s }, p2/Z, [SP]\n"
- "smin z29.s, p2/M, z29.s, z0.s\n"
- "st1b { z18.s }, p0, [x25, x27]\n"
- "add z31.s, z31.s, z1.s\n"
- "smin z28.s, p2/M, z28.s, z0.s\n"
- "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n"
- "smax z29.s, p2/M, z29.s, z16.s\n"
- "st1b { z29.s }, p0, [x24, x27]\n"
- "add z18.s, z18.s, z1.s\n"
- "smax z28.s, p2/M, z28.s, z16.s\n"
- "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- "st1b { z28.s }, p0, [x23, x27]\n"
- "add z29.s, z29.s, z1.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
- "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n"
- "add z26.s, z26.s, z12.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
- "add z25.s, z25.s, z12.s\n"
- "add z28.s, z28.s, z1.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
+ ".inst 0x44828a9d // srshl z29.s, p2/M, z29.s, z20.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828a9e // srshl z30.s, p2/M, z30.s, z20.s\n"
+ ".inst 0x44828a9f // srshl z31.s, p2/M, z31.s, z20.s\n"
"add z24.s, z24.s, z12.s\n"
- "add z23.s, z23.s, z12.s\n"
- "smin z26.s, p2/M, z26.s, z0.s\n"
- "smin z25.s, p2/M, z25.s, z0.s\n"
- "smin z24.s, p2/M, z24.s, z0.s\n"
- "smin z23.s, p2/M, z23.s, z0.s\n"
- "smax z26.s, p2/M, z26.s, z16.s\n"
- "st1b { z26.s }, p0, [x22, x27]\n"
- "smax z25.s, p2/M, z25.s, z16.s\n"
- "smax z24.s, p2/M, z24.s, z16.s\n"
- "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
- "smax z23.s, p2/M, z23.s, z16.s\n"
- "st1b { z25.s }, p0, [x21, x27]\n"
- "add z26.s, z26.s, z1.s\n"
- "st1b { z24.s }, p0, [x20, x27]\n"
- "st1b { z23.s }, p0, [x19, x27]\n"
- "incw x27\n"
- "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z1.s\n"
- "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
- "add z24.s, z24.s, z1.s\n"
- "add z23.s, z23.s, z1.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "smin z25.s, p2/M, z25.s, z10.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "smin z26.s, p2/M, z26.s, z10.s\n"
+ "smin z27.s, p2/M, z27.s, z10.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "smin z28.s, p2/M, z28.s, z10.s\n"
+ "smin z29.s, p2/M, z29.s, z10.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "smin z30.s, p2/M, z30.s, z10.s\n"
+ "smin z31.s, p2/M, z31.s, z10.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smax z25.s, p2/M, z25.s, z11.s\n"
+ "st1b { z24.s }, p0, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z11.s\n"
+ "smax z27.s, p2/M, z27.s, z11.s\n"
+ "st1b { z25.s }, p0, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z11.s\n"
+ "smax z29.s, p2/M, z29.s, z11.s\n"
+ "st1b { z26.s }, p0, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z11.s\n"
+ "smax z31.s, p2/M, z31.s, z11.s\n"
+ "st1b { z27.s }, p0, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p0, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "st1b { z29.s }, p0, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z13.s\n"
+ "st1b { z30.s }, p0, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z13.s\n"
+ "st1b { z31.s }, p0, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 0085bbc6bc..debaa8c296 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,384 +41,358 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "mov z20.b, #0x1\n"
- "ldr x24, [%x[inptrs], #0x0]\n"
- "ptrue p2.b\n"
- "mov z22.s, #0x1\n"
- "ldr x23, [%x[inptrs], #0x8]\n"
- "lsl x9, %x[n_channels], #0x2\n"
- "mov z30.s, #0x0\n"
- "ldr x22, [%x[inptrs], #0x10]\n"
- "addvl SP, SP, #-8\n"
- "mov z28.s, #0x0\n"
- "ldr x21, [%x[inptrs], #0x18]\n"
"mov x20, #0x6\n"
- "mov z29.s, #0x0\n"
- "ldr x19, [%x[inptrs], #0x20]\n"
"whilelt p0.b, XZR, x20\n"
- "mov z27.s, #0x0\n"
- "ld1b { z0.b }, p0/Z, [x24]\n"
- "mov x28, #0x0\n"
- "mov z26.s, #0x0\n"
- "ld1b { z3.b }, p0/Z, [x23]\n"
- "mov x27, #0x0\n"
- "mov z25.s, #0x0\n"
- "ld1b { z5.b }, p0/Z, [x22]\n"
- "whilelt p1.b, x28, x9\n"
- "mov z15.d, z0.d\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z20.d, z3.d\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
"ld1b { z4.b }, p0/Z, [x21]\n"
- "mov z24.s, #0x0\n"
- "ld1b { z6.b }, p0/Z, [x19]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
"ext z15.b, z15.b, z15.b, #0x1\n"
- "ldr x21, [%x[inptrs], #0x28]\n"
- "mov z16.d, z3.d\n"
- "ldr x20, [%x[inptrs], #0x30]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "ldr x19, [%x[inptrs], #0x38]\n"
- "mov z18.d, z5.d\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z20.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z20.d, z1.d\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z13.d, z5.d\n"
+ "mov z19.d, z6.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
- "zip1 z0.d, z0.d, z15.d\n"
- "ld1b { z1.b }, p0/Z, [x20]\n"
- "mov z0.q, z0.q[0]\n"
- "ld1b { z2.b }, p0/Z, [x19]\n"
- "zip1 z3.d, z3.d, z16.d\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z25.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
"mov z3.q, z3.q[0]\n"
- "ldp x26, x25, [%x[outptrs], #0x0]\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "ldp x24, x23, [%x[outptrs], #0x10]\n"
- "mov z16.d, z4.d\n"
- "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ "mov z30.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p1.b, x9, x10\n"
+ "mov z24.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "udot z24.s, z30.b, z3.b[0]\n"
+ "ld1w { z12.s }, p1/Z, [%x[params]]\n"
+ "mov z18.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "udot z28.s, z30.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z16.d, z0.d\n"
+ "udot z18.s, z30.b, z4.b[0]\n"
+ "udot z17.s, z30.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
"ext z16.b, z16.b, z16.b, #0x1\n"
- "ldp x20, x19, [%x[outptrs], #0x30]\n"
- "mov z17.d, z6.d\n"
+ "zip1 z1.d, z1.d, z20.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z13.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
"ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z5.d, z5.d, z18.d\n"
- "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "mov z5.q, z5.q[0]\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "zip1 z4.d, z4.d, z16.d\n"
- "ld1w { z13.s }, p1/Z, [%x[params]]\n"
- "mov z4.q, z4.q[0]\n"
+ "zip1 z6.d, z6.d, z19.d\n"
+ "zip1 z7.d, z7.d, z25.d\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z26.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "udot z26.s, z30.b, z2.b[0]\n"
"ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
- "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z29.s, #0x1\n"
+ "udot z22.s, z30.b, z2.b[2]\n"
+ "udot z24.s, z29.b, z3.b[1]\n"
"ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
- "mov z16.d, z7.d\n"
+ "zip1 z0.d, z0.d, z16.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "udot z28.s, z29.b, z3.b[3]\n"
"ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
- "addvl %x[params], %x[params], #5\n"
- "zip1 z6.d, z6.d, z17.d\n"
- "mov z17.d, z1.d\n"
+ "mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "zip1 z7.d, z7.d, z16.d\n"
+ "udot z18.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
"mov z7.q, z7.q[0]\n"
- "ext z17.b, z17.b, z17.b, #0x1\n"
- "mov z16.d, z2.d\n"
- "ext z16.b, z16.b, z16.b, #0x1\n"
- "mov z23.s, #0x0\n"
- "zip1 z1.d, z1.d, z17.d\n"
- "mov z1.q, z1.q[0]\n"
- "zip1 z2.d, z2.d, z16.d\n"
- "mov z2.q, z2.q[0]\n"
- "mov z18.s, #0x0\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0x0\n"
"mov z21.s, #0x0\n"
+ "udot z17.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z20.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "udot z21.s, z30.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
"mov z19.s, #0x0\n"
- "udot z30.s, z20.b, z0.b[0]\n"
- "udot z28.s, z20.b, z0.b[2]\n"
- "udot z29.s, z20.b, z3.b[0]\n"
- "udot z27.s, z20.b, z3.b[2]\n"
- "udot z30.s, z22.b, z0.b[1]\n"
- "udot z28.s, z22.b, z0.b[3]\n"
- "udot z29.s, z22.b, z3.b[1]\n"
- "udot z27.s, z22.b, z3.b[3]\n"
- "udot z26.s, z20.b, z5.b[0]\n"
- "udot z25.s, z20.b, z5.b[2]\n"
- "udot z24.s, z20.b, z4.b[0]\n"
- "udot z23.s, z20.b, z4.b[2]\n"
- "udot z26.s, z22.b, z5.b[1]\n"
- "udot z25.s, z22.b, z5.b[3]\n"
- "udot z24.s, z22.b, z4.b[1]\n"
- "udot z23.s, z22.b, z4.b[3]\n"
- "udot z18.s, z20.b, z6.b[0]\n"
- "udot z17.s, z20.b, z6.b[2]\n"
- "udot z16.s, z20.b, z7.b[0]\n"
- "udot z21.s, z20.b, z7.b[2]\n"
- "udot z18.s, z22.b, z6.b[1]\n"
- "udot z17.s, z22.b, z6.b[3]\n"
- "udot z16.s, z22.b, z7.b[1]\n"
- "udot z21.s, z22.b, z7.b[3]\n"
- "udot z19.s, z20.b, z1.b[0]\n"
- "mov z30.d, z30.d\n"
- "mov z28.d, z28.d\n"
- "add z30.s, z30.s, z29.s\n"
- "udot z19.s, z22.b, z1.b[1]\n"
- "add z28.s, z28.s, z27.s\n"
- "add z30.s, z30.s, z26.s\n"
- "mov z29.d, z29.d\n"
- "add z28.s, z28.s, z25.s\n"
- "add z30.s, z30.s, z24.s\n"
- "mov z27.d, z27.d\n"
- "add z28.s, z28.s, z23.s\n"
- "add z30.s, z30.s, z18.s\n"
- "add z29.s, z29.s, z26.s\n"
- "add z28.s, z28.s, z17.s\n"
- "add z27.s, z27.s, z25.s\n"
- "add z29.s, z29.s, z24.s\n"
- "mov z26.d, z26.d\n"
- "add z27.s, z27.s, z23.s\n"
- "add z29.s, z29.s, z18.s\n"
- "mov z25.d, z25.d\n"
- "add z27.s, z27.s, z17.s\n"
- "add z29.s, z29.s, z16.s\n"
- "add z26.s, z26.s, z24.s\n"
- "add z27.s, z27.s, z21.s\n"
- "add z25.s, z25.s, z23.s\n"
- "add z26.s, z26.s, z18.s\n"
- "mov z24.d, z24.d\n"
- "add z25.s, z25.s, z17.s\n"
- "add z26.s, z26.s, z16.s\n"
- "mov z23.d, z23.d\n"
- "add z25.s, z25.s, z21.s\n"
- "add z26.s, z26.s, z19.s\n"
+ "udot z20.s, z30.b, z1.b[2]\n"
+ "udot z25.s, z30.b, z5.b[0]\n"
+ "udot z27.s, z30.b, z5.b[2]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z30.b, z6.b[0]\n"
+ "udot z26.s, z29.b, z2.b[1]\n"
"add z24.s, z24.s, z18.s\n"
"mov z18.s, #0x0\n"
- "udot z18.s, z20.b, z1.b[2]\n"
- "add z23.s, z23.s, z17.s\n"
- "mov z17.s, #0x0\n"
- "udot z17.s, z20.b, z2.b[0]\n"
- "udot z18.s, z22.b, z1.b[3]\n"
- "add z24.s, z24.s, z16.s\n"
+ "udot z18.s, z30.b, z6.b[2]\n"
+ "udot z22.s, z29.b, z2.b[3]\n"
+ "add z17.s, z28.s, z17.s\n"
+ "mov z16.s, #0x0\n"
+ "udot z16.s, z30.b, z7.b[0]\n"
+ "udot z21.s, z29.b, z1.b[1]\n"
+ "udot z20.s, z29.b, z1.b[3]\n"
+ "add z28.s, z26.s, z24.s\n"
+ "udot z25.s, z29.b, z5.b[1]\n"
+ "udot z27.s, z29.b, z5.b[3]\n"
+ "add z31.s, z22.s, z17.s\n"
+ "udot z19.s, z29.b, z6.b[1]\n"
+ "udot z18.s, z29.b, z6.b[3]\n"
+ "add z22.s, z21.s, z28.s\n"
+ "udot z16.s, z29.b, z7.b[1]\n"
+ "add z21.s, z20.s, z31.s\n"
+ "add z20.s, z25.s, z19.s\n"
+ "add z19.s, z27.s, z18.s\n"
+ "add z18.s, z16.s, z24.s\n"
+ "mov z16.s, #0x0\n"
+ "udot z16.s, z30.b, z7.b[2]\n"
+ "udot z16.s, z29.b, z7.b[3]\n"
+ "add z17.s, z16.s, z17.s\n"
"mov z16.s, #0x0\n"
- "udot z17.s, z22.b, z2.b[1]\n"
- "udot z16.s, z20.b, z2.b[2]\n"
- "add z25.s, z25.s, z18.s\n"
- "add z23.s, z23.s, z21.s\n"
- "add z24.s, z24.s, z19.s\n"
- "udot z16.s, z22.b, z2.b[3]\n"
- "add z23.s, z23.s, z18.s\n"
- "add z24.s, z24.s, z17.s\n"
- "neg z15.s, p2/M, z15.s\n"
- "add z23.s, z23.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z15.s\n"
- "st1w { z30.s }, p2, [SP]\n"
- "add z30.s, z30.s, z13.s\n"
- "mul z28.s, p2/M, z28.s, z15.s\n"
- "st1w { z28.s }, p2, [SP, #1, MUL VL]\n"
- "add z28.s, z28.s, z13.s\n"
- "mul z29.s, p2/M, z29.s, z15.s\n"
- "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
- "add z29.s, z29.s, z13.s\n"
- "mul z27.s, p2/M, z27.s, z15.s\n"
- "st1w { z27.s }, p2, [SP, #3, MUL VL]\n"
- "add z27.s, z27.s, z13.s\n"
- "mul z26.s, p2/M, z26.s, z15.s\n"
- "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
- "add z26.s, z26.s, z13.s\n"
- "mul z25.s, p2/M, z25.s, z15.s\n"
- "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z13.s\n"
- "mul z24.s, p2/M, z24.s, z15.s\n"
- "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
- "add z24.s, z24.s, z13.s\n"
- "mul z23.s, p2/M, z23.s, z15.s\n"
- "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
- "add z23.s, z23.s, z13.s\n"
+ "udot z16.s, z30.b, z0.b[0]\n"
+ "udot z16.s, z29.b, z0.b[1]\n"
+ "add z24.s, z22.s, z16.s\n"
+ "add z26.s, z22.s, z25.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mov z16.s, #0x0\n"
+ "udot z16.s, z30.b, z0.b[2]\n"
+ "udot z16.s, z29.b, z0.b[3]\n"
+ "add z25.s, z21.s, z16.s\n"
+ "add z27.s, z21.s, z27.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z28.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z18.s, z20.s\n"
+ "add z31.s, z17.s, z19.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
"1:" // Loop
- "udot z30.s, z8.b, z0.b[0]\n"
- "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "incb x28\n"
- "udot z28.s, z8.b, z0.b[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "whilelt p0.s, x27, %x[n_channels]\n"
- "udot z29.s, z8.b, z3.b[0]\n"
- "whilelt p1.b, x28, x9\n"
- "udot z27.s, z8.b, z3.b[2]\n"
- "udot z26.s, z8.b, z5.b[0]\n"
- "udot z25.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z4.b[0]\n"
- "udot z23.s, z8.b, z4.b[2]\n"
+ "udot z24.s, z8.b, z0.b[0]\n"
+ "udot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "udot z26.s, z8.b, z1.b[0]\n"
+ "udot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "udot z24.s, z9.b, z0.b[1]\n"
+ "udot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p1.b, x9, x10\n"
+ "udot z26.s, z9.b, z1.b[1]\n"
+ "udot z27.s, z9.b, z1.b[3]\n"
+ "udot z28.s, z8.b, z2.b[0]\n"
+ "udot z29.s, z8.b, z2.b[2]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
"ld1b { z8.b }, p2/Z, [%x[params]]\n"
- "udot z30.s, z9.b, z0.b[1]\n"
- "udot z28.s, z9.b, z0.b[3]\n"
- "udot z29.s, z9.b, z3.b[1]\n"
- "udot z27.s, z9.b, z3.b[3]\n"
- "udot z26.s, z9.b, z5.b[1]\n"
- "udot z25.s, z9.b, z5.b[3]\n"
- "udot z24.s, z9.b, z4.b[1]\n"
- "udot z23.s, z9.b, z4.b[3]\n"
+ "udot z24.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "udot z28.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z2.b[3]\n"
+ "udot z30.s, z9.b, z3.b[1]\n"
+ "udot z31.s, z9.b, z3.b[3]\n"
"ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "udot z28.s, z10.b, z3.b[2]\n"
- "udot z29.s, z10.b, z5.b[0]\n"
- "udot z27.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z4.b[0]\n"
- "udot z25.s, z10.b, z4.b[2]\n"
- "udot z24.s, z10.b, z6.b[0]\n"
- "udot z23.s, z10.b, z6.b[2]\n"
+ "udot z24.s, z11.b, z1.b[1]\n"
+ "udot z25.s, z11.b, z1.b[3]\n"
+ "udot z26.s, z11.b, z2.b[1]\n"
+ "udot z27.s, z11.b, z2.b[3]\n"
+ "udot z28.s, z10.b, z3.b[0]\n"
+ "udot z29.s, z10.b, z3.b[2]\n"
+ "udot z30.s, z10.b, z4.b[0]\n"
+ "udot z31.s, z10.b, z4.b[2]\n"
"ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "udot z30.s, z11.b, z3.b[1]\n"
- "udot z28.s, z11.b, z3.b[3]\n"
- "udot z29.s, z11.b, z5.b[1]\n"
- "udot z27.s, z11.b, z5.b[3]\n"
- "udot z26.s, z11.b, z4.b[1]\n"
- "udot z25.s, z11.b, z4.b[3]\n"
- "udot z24.s, z11.b, z6.b[1]\n"
- "udot z23.s, z11.b, z6.b[3]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z25.s, z8.b, z2.b[2]\n"
+ "udot z26.s, z8.b, z3.b[0]\n"
+ "udot z27.s, z8.b, z3.b[2]\n"
+ "udot z28.s, z11.b, z3.b[1]\n"
+ "udot z29.s, z11.b, z3.b[3]\n"
+ "udot z30.s, z11.b, z4.b[1]\n"
+ "udot z31.s, z11.b, z4.b[3]\n"
"ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z9.b, z2.b[1]\n"
+ "udot z25.s, z9.b, z2.b[3]\n"
+ "udot z26.s, z9.b, z3.b[1]\n"
+ "udot z27.s, z9.b, z3.b[3]\n"
+ "udot z28.s, z8.b, z4.b[0]\n"
+ "udot z29.s, z8.b, z4.b[2]\n"
"udot z30.s, z8.b, z5.b[0]\n"
- "udot z28.s, z8.b, z5.b[2]\n"
- "udot z29.s, z8.b, z4.b[0]\n"
- "udot z27.s, z8.b, z4.b[2]\n"
- "udot z26.s, z8.b, z6.b[0]\n"
- "udot z25.s, z8.b, z6.b[2]\n"
- "udot z24.s, z8.b, z7.b[0]\n"
- "udot z23.s, z8.b, z7.b[2]\n"
+ "udot z31.s, z8.b, z5.b[2]\n"
"ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z10.b, z3.b[0]\n"
+ "udot z25.s, z10.b, z3.b[2]\n"
+ "udot z26.s, z10.b, z4.b[0]\n"
+ "udot z27.s, z10.b, z4.b[2]\n"
+ "udot z28.s, z9.b, z4.b[1]\n"
+ "udot z29.s, z9.b, z4.b[3]\n"
"udot z30.s, z9.b, z5.b[1]\n"
- "udot z28.s, z9.b, z5.b[3]\n"
- "udot z29.s, z9.b, z4.b[1]\n"
- "udot z27.s, z9.b, z4.b[3]\n"
- "udot z26.s, z9.b, z6.b[1]\n"
- "udot z25.s, z9.b, z6.b[3]\n"
- "udot z24.s, z9.b, z7.b[1]\n"
- "udot z23.s, z9.b, z7.b[3]\n"
+ "udot z31.s, z9.b, z5.b[3]\n"
"ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "udot z30.s, z10.b, z4.b[0]\n"
- "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
- "udot z28.s, z10.b, z4.b[2]\n"
- "udot z29.s, z10.b, z6.b[0]\n"
- "udot z27.s, z10.b, z6.b[2]\n"
- "udot z26.s, z10.b, z7.b[0]\n"
- "udot z25.s, z10.b, z7.b[2]\n"
- "udot z24.s, z10.b, z1.b[0]\n"
- "udot z23.s, z10.b, z1.b[2]\n"
+ "udot z24.s, z11.b, z3.b[1]\n"
+ "udot z25.s, z11.b, z3.b[3]\n"
+ "ld1w { z12.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z26.s, z11.b, z4.b[1]\n"
+ "udot z27.s, z11.b, z4.b[3]\n"
+ "udot z28.s, z10.b, z5.b[0]\n"
+ "udot z29.s, z10.b, z5.b[2]\n"
+ "udot z30.s, z10.b, z6.b[0]\n"
+ "udot z31.s, z10.b, z6.b[2]\n"
"ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "udot z30.s, z11.b, z4.b[1]\n"
- "udot z28.s, z11.b, z4.b[3]\n"
- "udot z29.s, z11.b, z6.b[1]\n"
- "udot z27.s, z11.b, z6.b[3]\n"
- "udot z26.s, z11.b, z7.b[1]\n"
- "udot z25.s, z11.b, z7.b[3]\n"
- "udot z24.s, z11.b, z1.b[1]\n"
- "udot z23.s, z11.b, z1.b[3]\n"
+ "udot z24.s, z8.b, z4.b[0]\n"
+ "udot z25.s, z8.b, z4.b[2]\n"
+ "udot z26.s, z8.b, z5.b[0]\n"
+ "udot z27.s, z8.b, z5.b[2]\n"
+ "udot z28.s, z11.b, z5.b[1]\n"
+ "udot z29.s, z11.b, z5.b[3]\n"
+ "udot z30.s, z11.b, z6.b[1]\n"
+ "udot z31.s, z11.b, z6.b[3]\n"
"ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "udot z30.s, z8.b, z6.b[0]\n"
- "udot z28.s, z8.b, z6.b[2]\n"
- "udot z29.s, z8.b, z7.b[0]\n"
- "udot z27.s, z8.b, z7.b[2]\n"
- "udot z26.s, z8.b, z1.b[0]\n"
- "udot z25.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z23.s, z8.b, z2.b[2]\n"
+ "udot z24.s, z9.b, z4.b[1]\n"
+ "udot z25.s, z9.b, z4.b[3]\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ "udot z26.s, z9.b, z5.b[1]\n"
+ "udot z27.s, z9.b, z5.b[3]\n"
+ ".inst 0x04b17739 // sqrdmulh z25.s, z25.s, z17.s\n"
+ "udot z28.s, z8.b, z6.b[0]\n"
+ "udot z29.s, z8.b, z6.b[2]\n"
+ ".inst 0x04b1775a // sqrdmulh z26.s, z26.s, z17.s\n"
+ "udot z30.s, z8.b, z7.b[0]\n"
+ "udot z31.s, z8.b, z7.b[2]\n"
+ ".inst 0x04b1777b // sqrdmulh z27.s, z27.s, z17.s\n"
"ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
- "udot z30.s, z9.b, z6.b[1]\n"
- "udot z28.s, z9.b, z6.b[3]\n"
- "udot z29.s, z9.b, z7.b[1]\n"
- "udot z27.s, z9.b, z7.b[3]\n"
- "udot z26.s, z9.b, z1.b[1]\n"
- "udot z25.s, z9.b, z1.b[3]\n"
- "udot z24.s, z9.b, z2.b[1]\n"
- "udot z23.s, z9.b, z2.b[3]\n"
+ "udot z28.s, z9.b, z6.b[1]\n"
+ "udot z29.s, z9.b, z6.b[3]\n"
+ "and z16.d, z24.d, z19.d\n"
+ "udot z30.s, z9.b, z7.b[1]\n"
+ "udot z31.s, z9.b, z7.b[3]\n"
+ "and z18.d, z25.d, z19.d\n"
"ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
- "addvl %x[params], %x[params], #-3\n"
- ".inst 0x04b677de // sqrdmulh z30.s, z30.s, z22.s\n"
- ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
- ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
- ".inst 0x04b6777b // sqrdmulh z27.s, z27.s, z22.s\n"
- ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
- "and z20.d, z30.d, z21.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z19.d, z28.d, z21.d\n"
- "and z18.d, z29.d, z21.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "and z17.d, z27.d, z21.d\n"
- "and z16.d, z26.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "addvl %x[params], %x[params], #-3\n"
+ ".inst 0x04b1779c // sqrdmulh z28.s, z28.s, z17.s\n"
+ ".inst 0x04b177bd // sqrdmulh z29.s, z29.s, z17.s\n"
+ ".inst 0x04b177de // sqrdmulh z30.s, z30.s, z17.s\n"
+ ".inst 0x04b177ff // sqrdmulh z31.s, z31.s, z17.s\n"
+ "and z17.d, z26.d, z19.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "sqadd z30.s, z30.s, z20.s\n"
- ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ "and z16.d, z27.d, z19.d\n"
+ ".inst 0x44828a78 // srshl z24.s, p2/M, z24.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828a79 // srshl z25.s, p2/M, z25.s, z19.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828a7a // srshl z26.s, p2/M, z26.s, z19.s\n"
+ ".inst 0x44828a7b // srshl z27.s, p2/M, z27.s, z19.s\n"
+ "and z16.d, z28.d, z19.d\n"
+ "and z18.d, z29.d, z19.d\n"
+ "and z17.d, z30.d, z19.d\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
- "sqadd z28.s, z28.s, z19.s\n"
- "sqadd z29.s, z29.s, z18.s\n"
- "and z18.d, z25.d, z21.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z27.s, z27.s, z17.s\n"
- "sqadd z26.s, z26.s, z16.s\n"
- "and z17.d, z24.d, z21.d\n"
"asr z17.s, z17.s, #0x1f\n"
- "and z16.d, z23.d, z21.d\n"
- ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ "sqadd z28.s, z28.s, z16.s\n"
+ "and z16.d, z31.d, z19.d\n"
+ ".inst 0x44828a7c // srshl z28.s, p2/M, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- "add z30.s, z30.s, z14.s\n"
- "sqadd z24.s, z24.s, z17.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828a7d // srshl z29.s, p2/M, z29.s, z19.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828a7e // srshl z30.s, p2/M, z30.s, z19.s\n"
+ ".inst 0x44828a7f // srshl z31.s, p2/M, z31.s, z19.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
"add z28.s, z28.s, z14.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- "smin z30.s, p2/M, z30.s, z12.s\n"
"add z29.s, z29.s, z14.s\n"
- "smin z28.s, p2/M, z28.s, z12.s\n"
- ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
- "smax z30.s, p2/M, z30.s, z31.s\n"
- "st1b { z30.s }, p0, [x26, x27]\n"
- "add z27.s, z27.s, z14.s\n"
- "smax z28.s, p2/M, z28.s, z31.s\n"
- "ld1w { z30.s }, p2/Z, [SP]\n"
- "smin z29.s, p2/M, z29.s, z12.s\n"
- "st1b { z28.s }, p0, [x25, x27]\n"
- "add z30.s, z30.s, z13.s\n"
- "smin z27.s, p2/M, z27.s, z12.s\n"
- "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n"
- "smax z29.s, p2/M, z29.s, z31.s\n"
- "st1b { z29.s }, p0, [x24, x27]\n"
- "add z28.s, z28.s, z13.s\n"
- "smax z27.s, p2/M, z27.s, z31.s\n"
- "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- "st1b { z27.s }, p0, [x23, x27]\n"
- "add z29.s, z29.s, z13.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
- "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n"
- "add z26.s, z26.s, z14.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
- "add z25.s, z25.s, z14.s\n"
- "add z27.s, z27.s, z13.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z23.s, z23.s, z14.s\n"
- "smin z26.s, p2/M, z26.s, z12.s\n"
- "smin z25.s, p2/M, z25.s, z12.s\n"
- "smin z24.s, p2/M, z24.s, z12.s\n"
- "smin z23.s, p2/M, z23.s, z12.s\n"
- "smax z26.s, p2/M, z26.s, z31.s\n"
- "st1b { z26.s }, p0, [x22, x27]\n"
- "smax z25.s, p2/M, z25.s, z31.s\n"
- "smax z24.s, p2/M, z24.s, z31.s\n"
- "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
- "smax z23.s, p2/M, z23.s, z31.s\n"
- "st1b { z25.s }, p0, [x21, x27]\n"
- "add z26.s, z26.s, z13.s\n"
- "st1b { z24.s }, p0, [x20, x27]\n"
- "st1b { z23.s }, p0, [x19, x27]\n"
- "incw x27\n"
- "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
- "add z25.s, z25.s, z13.s\n"
- "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
- "add z24.s, z24.s, z13.s\n"
- "add z23.s, z23.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z13.s\n"
+ "smax z25.s, p2/M, z25.s, z13.s\n"
+ "st1b { z24.s }, p0, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z13.s\n"
+ "smax z27.s, p2/M, z27.s, z13.s\n"
+ "st1b { z25.s }, p0, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z13.s\n"
+ "smax z29.s, p2/M, z29.s, z13.s\n"
+ "st1b { z26.s }, p0, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z13.s\n"
+ "smax z31.s, p2/M, z31.s, z13.s\n"
+ "st1b { z27.s }, p0, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p0, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z12.s\n"
+ "st1b { z29.s }, p0, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z12.s\n"
+ "st1b { z30.s }, p0, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z12.s\n"
+ "st1b { z31.s }, p0, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
"b.any 1b\n"
- "addvl SP, SP, #8\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index dc8fad95fa..ff3ec0ba48 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,324 +91,316 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x8, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x16, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x15, #0x0\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x13, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z11.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z26.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z12.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z14.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x16, x8\n"
- "ld1rw { z17.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x16, x8\n"
- "ldp x11, x10, [x21, #0x0]\n"
- "mov x19, x16\n"
- "incw x19\n"
- "ldp x9, x28, [x21, #0x10]\n"
- "whilelt p1.s, x19, x8\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z10.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z10.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z15.s, z10.s, z16.s\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x8\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z23.b }, p4/Z, [x21]\n"
+ "ld1rb { z15.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z14.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z11.h }, p4/Z, [x20]\n"
+ "ldp x15, x14, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x8, x17\n"
+ "ldp x13, x12, [x24, #0x10]\n"
+ "whilelt p2.s, x8, x17\n"
+ "whilelt p1.s, x23, x17\n"
+ "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z0.h }, p4/Z, [x16]\n"
+ "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+ ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
+ ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
+ "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+ ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
+ ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z13.s, z17.s, z16.s\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x16]\n"
+ "ldp x24, x23, [x11, #0x0]\n"
+ "addvl x26, x26, #2\n"
+ "mov z26.d, z13.d\n"
+ "ldp x22, x21, [x11, #0x10]\n"
+ "ldr x20, [x11, #0x20]\n"
+ "mov z10.d, z17.d\n"
+ "mov z24.d, z13.d\n"
+ "ld1b { z31.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z30.h }, p3/Z, [x23, x8]\n"
+ "mov z16.d, z17.d\n"
"mov z25.d, z13.d\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "mov z23.d, z13.d\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
- "mov z9.d, z15.d\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "mov z22.d, z15.d\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- "mov z10.d, z13.d\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "mov z24.d, z15.d\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x455a1000 // ssublb z0.h, z0.b, z26.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
- ".inst 0x455a1021 // ssublb z1.h, z1.b, z26.b\n"
- "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x455a1042 // ssublb z2.h, z2.b, z26.b\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- ".inst 0x455a1063 // ssublb z3.h, z3.b, z26.b\n"
- "ldp x23, x22, [x13, #0x0]\n"
- ".inst 0x455a1084 // ssublb z4.h, z4.b, z26.b\n"
- "ldp x21, x20, [x13, #0x10]\n"
- ".inst 0x455a10a5 // ssublb z5.h, z5.b, z26.b\n"
- ".inst 0x455a10c6 // ssublb z6.h, z6.b, z26.b\n"
- "ldr x19, [x13, #0x20]\n"
- ".inst 0x455a10e7 // ssublb z7.h, z7.b, z26.b\n"
- ".inst 0x455a1108 // ssublb z8.h, z8.b, z26.b\n"
- "ld1b { z31.h }, p3/Z, [x23, x16]\n"
- "ld1b { z30.h }, p3/Z, [x22, x16]\n"
- ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
- "ld1b { z29.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
- "ld1b { z28.h }, p3/Z, [x20, x16]\n"
- "ld1b { z27.h }, p3/Z, [x19, x16]\n"
- ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
- ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
- ".inst 0x454b1b7b // usublb z27.h, z27.b, z11.b\n"
+ "ld1b { z29.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z28.h }, p3/Z, [x21, x8]\n"
+ "mov z9.d, z17.d\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ "ld1b { z27.h }, p3/Z, [x20, x8]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x26, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
"1:" // Loop
".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
- "ldr x20, [x13, #0x28]\n"
- "whilelt p0.h, x15, x8\n"
- ".inst 0x448447ef // smlalt z15.s, p4/M, z31.h, z4.h\n"
- "ldr x27, [x13, #0x30]\n"
- "inch x17\n"
- ".inst 0x448343f9 // smlalb z25.s, p4/M, z31.h, z3.h\n"
- "ldr x26, [x13, #0x38]\n"
- ".inst 0x448347e9 // smlalt z9.s, p4/M, z31.h, z3.h\n"
- "ldr x25, [x13, #0x40]\n"
- ".inst 0x448143f7 // smlalb z23.s, p4/M, z31.h, z1.h\n"
- "ldr x19, [x13, #0x48]\n"
- ".inst 0x448147f6 // smlalt z22.s, p4/M, z31.h, z1.h\n"
- "ldr x24, [x13, #0x50]\n"
- ".inst 0x448043ea // smlalb z10.s, p4/M, z31.h, z0.h\n"
- "ldr x23, [x13, #0x58]\n"
- ".inst 0x448047f8 // smlalt z24.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
+ "ldr x22, [x11, #0x28]\n"
+ "ldr x27, [x11, #0x38]\n"
+ ".inst 0x448343fa // smlalb z26.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
+ "ldr x21, [x11, #0x30]\n"
+ "ldr x26, [x11, #0x40]\n"
".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- "ldr x22, [x13, #0x60]\n"
- ".inst 0x448047cf // smlalt z15.s, p4/M, z30.h, z0.h\n"
- "ld1b { z30.h }, p3/Z, [x19, x16]\n"
- ".inst 0x448243b9 // smlalb z25.s, p4/M, z29.h, z2.h\n"
- "ldr x21, [x13, #0x68]\n"
- ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
- "ldr x20, [x13, #0x70]\n"
- ".inst 0x448247a9 // smlalt z9.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x27, x16]\n"
- ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
- "ldr x19, [x13, #0x78]\n"
+ ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
+ "ldr x20, [x11, #0x48]\n"
+ "ld1b { z30.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x448243ba // smlalb z26.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x448247aa // smlalt z10.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147f0 // smlalt z16.s, p4/M, z31.h, z1.h\n"
+ "ldr x25, [x11, #0x50]\n"
+ "ldr x24, [x11, #0x58]\n"
+ ".inst 0x448043f9 // smlalb z25.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047e9 // smlalt z9.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
- "ld1w { z19.s }, p2/Z, [x14]\n"
- ".inst 0x4485478f // smlalt z15.s, p4/M, z28.h, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
- "addvl x14, x14, #2\n"
- ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
- ".inst 0x44844399 // smlalb z25.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n"
- "uzp1 z21.s, z19.s, z16.s\n"
- "uzp2 z18.s, z19.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
- ".inst 0x44824397 // smlalb z23.s, p4/M, z28.h, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
- "addvl x12, x12, #2\n"
- ".inst 0x44824796 // smlalt z22.s, p4/M, z28.h, z2.h\n"
- ".inst 0x4481438a // smlalb z10.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814798 // smlalt z24.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x26, x16]\n"
- "uzp1 z20.s, z19.s, z16.s\n"
- "uzp2 z19.s, z19.s, z16.s\n"
- ".inst 0x448643f7 // smlalb z23.s, p4/M, z31.h, z6.h\n"
- ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
- ".inst 0x448647f6 // smlalt z22.s, p4/M, z31.h, z6.h\n"
- "ld1b { z31.h }, p3/Z, [x25, x16]\n"
+ ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ "ldr x23, [x11, #0x60]\n"
+ ".inst 0x4484439a // smlalb z26.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
+ "ldr x22, [x11, #0x68]\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
+ "ldr x20, [x11, #0x78]\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ ".inst 0x44814399 // smlalb z25.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x27, x8]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
- ".inst 0x4487476f // smlalt z15.s, p4/M, z27.h, z7.h\n"
- ".inst 0x44864379 // smlalb z25.s, p4/M, z27.h, z6.h\n"
- ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
- ".inst 0x44864769 // smlalt z9.s, p4/M, z27.h, z6.h\n"
- ".inst 0x44844377 // smlalb z23.s, p4/M, z27.h, z4.h\n"
- ".inst 0x44844776 // smlalt z22.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4483436a // smlalb z10.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
+ "ld1w { z18.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z19.s, z20.s, z18.s\n"
+ ".inst 0x4486437a // smlalb z26.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x4486476a // smlalt z10.s, p4/M, z27.h, z6.h\n"
+ "uzp2 z22.s, z20.s, z18.s\n"
+ "ld1w { z20.s }, p2/Z, [x28]\n"
+ ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448647f0 // smlalt z16.s, p4/M, z31.h, z6.h\n"
+ "ld1b { z31.h }, p3/Z, [x26, x8]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x44834379 // smlalb z25.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
+ "whilelt p0.h, x10, x17\n"
+ "inch x16\n"
".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
- ".inst 0x4481478f // smlalt z15.s, p4/M, z28.h, z1.h\n"
- ".inst 0x448843aa // smlalb z10.s, p4/M, z29.h, z8.h\n"
- ".inst 0x448847b8 // smlalt z24.s, p4/M, z29.h, z8.h\n"
- "ld1b { z29.h }, p3/Z, [x24, x16]\n"
- ".inst 0x44804399 // smlalb z25.s, p4/M, z28.h, z0.h\n"
- ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n"
- "ld1b { z28.h }, p3/Z, [x23, x16]\n"
+ ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
+ "ldr x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x4480439a // smlalb z26.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x4480478a // smlalt z10.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z28.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448843b9 // smlalb z25.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44844770 // smlalt z16.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
+ "ld1b { z29.h }, p3/Z, [x25, x8]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
- ".inst 0x448247ef // smlalt z15.s, p4/M, z31.h, z2.h\n"
- ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
- ".inst 0x448143f9 // smlalb z25.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n"
- "ld1b { z31.h }, p3/Z, [x22, x16]\n"
+ ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
+ "ld1w { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ ".inst 0x448143fa // smlalb z26.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147ea // smlalt z10.s, p4/M, z31.h, z1.h\n"
+ "ld1b { z31.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448443d9 // smlalb z25.s, p4/M, z30.h, z4.h\n"
+ "uzp1 z1.s, z20.s, z18.s\n"
".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847cf // smlalt z15.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448743d9 // smlalb z25.s, p4/M, z30.h, z7.h\n"
- ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
- ".inst 0x448747c9 // smlalt z9.s, p4/M, z30.h, z7.h\n"
- ".inst 0x448543d7 // smlalb z23.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448547d6 // smlalt z22.s, p4/M, z30.h, z5.h\n"
- ".inst 0x448443ca // smlalb z10.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448447d8 // smlalt z24.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
+ "uzp2 z27.s, z20.s, z18.s\n"
+ ".inst 0x448743da // smlalb z26.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448747ca // smlalt z10.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448547d0 // smlalt z16.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448447c9 // smlalt z9.s, p4/M, z30.h, z4.h\n"
+ "ld1b { z30.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44824399 // smlalb z25.s, p4/M, z28.h, z2.h\n"
".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448347af // smlalt z15.s, p4/M, z29.h, z3.h\n"
- ".inst 0x448043b7 // smlalb z23.s, p4/M, z29.h, z0.h\n"
- ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
- ".inst 0x448047b6 // smlalt z22.s, p4/M, z29.h, z0.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44854399 // smlalb z25.s, p4/M, z28.h, z5.h\n"
- ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
- ".inst 0x4482438a // smlalb z10.s, p4/M, z28.h, z2.h\n"
- ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
- ".inst 0x44824798 // smlalt z24.s, p4/M, z28.h, z2.h\n"
- "ld1b { z28.h }, p3/Z, [x19, x16]\n"
- "inch x16\n"
+ ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x44824789 // smlalt z9.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448543d9 // smlalb z25.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x4485439a // smlalb z26.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
- "whilelt p2.s, x16, x8\n"
- ".inst 0x448647ef // smlalt z15.s, p4/M, z31.h, z6.h\n"
- "mov x19, x16\n"
- ".inst 0x448343f7 // smlalb z23.s, p4/M, z31.h, z3.h\n"
- "incw x19\n"
- ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
- "whilelt p1.s, x19, x8\n"
- ".inst 0x448347f6 // smlalt z22.s, p4/M, z31.h, z3.h\n"
- "whilelt p3.h, x16, x8\n"
- ".inst 0x04b575ad // sqrdmulh z13.s, z13.s, z21.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x448843d9 // smlalb z25.s, p4/M, z30.h, z8.h\n"
- ".inst 0x448847c9 // smlalt z9.s, p4/M, z30.h, z8.h\n"
- "and z4.d, z13.d, z20.d\n"
- "and z16.d, z15.d, z19.d\n"
- ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
- "sqadd z13.s, z13.s, z4.s\n"
- "sqadd z15.s, z15.s, z16.s\n"
- "and z2.d, z25.d, z20.d\n"
- "and z16.d, z9.d, z19.d\n"
- ".inst 0x448543ca // smlalb z10.s, p4/M, z30.h, z5.h\n"
- "asr z2.s, z2.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x448547d8 // smlalt z24.s, p4/M, z30.h, z5.h\n"
- "sqadd z25.s, z25.s, z2.s\n"
- "sqadd z9.s, z9.s, z16.s\n"
- ".inst 0x448743b7 // smlalb z23.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448747b6 // smlalt z22.s, p4/M, z29.h, z7.h\n"
- ".inst 0x448643aa // smlalb z10.s, p4/M, z29.h, z6.h\n"
- ".inst 0x448647b8 // smlalt z24.s, p4/M, z29.h, z6.h\n"
- ".inst 0x44884397 // smlalb z23.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44884796 // smlalt z22.s, p4/M, z28.h, z8.h\n"
- ".inst 0x4487438a // smlalb z10.s, p4/M, z28.h, z7.h\n"
- ".inst 0x44874798 // smlalt z24.s, p4/M, z28.h, z7.h\n"
- ".inst 0x04b576f7 // sqrdmulh z23.s, z23.s, z21.s\n"
- ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
- ".inst 0x04b5754a // sqrdmulh z10.s, z10.s, z21.s\n"
- ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
- "and z18.d, z23.d, z20.d\n"
- "and z0.d, z22.d, z19.d\n"
- "and z16.d, z10.d, z20.d\n"
+ ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "inch x8\n"
+ ".inst 0x448547c9 // smlalt z9.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
+ "and z21.d, z13.d, z1.d\n"
+ "mov x20, x8\n"
+ ".inst 0x448643b9 // smlalb z25.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ "incw x20\n"
+ ".inst 0x448747b0 // smlalt z16.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448647a9 // smlalt z9.s, p4/M, z29.h, z6.h\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "whilelt p2.s, x8, x17\n"
+ ".inst 0x448843da // smlalb z26.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
+ "and z20.d, z17.d, z27.d\n"
+ "whilelt p1.s, x20, x17\n"
+ ".inst 0x44874399 // smlalb z25.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x448847ca // smlalt z10.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
+ "whilelt p3.h, x8, x17\n"
+ ".inst 0x44884790 // smlalt z16.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
+ "sqadd z13.s, z13.s, z21.s\n"
+ ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z26.d, z1.d\n"
+ ".inst 0x04b6754a // sqrdmulh z10.s, z10.s, z22.s\n"
+ "and z18.d, z24.d, z1.d\n"
+ ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
+ "and z21.d, z25.d, z1.d\n"
+ ".inst 0x04b67529 // sqrdmulh z9.s, z9.s, z22.s\n"
+ "sqadd z17.s, z17.s, z20.s\n"
+ ".inst 0x44829371 // srshl z17.s, p4/M, z17.s, z27.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z10.d, z27.d\n"
"asr z18.s, z18.s, #0x1f\n"
- "asr z0.s, z0.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z23.s, z23.s, z18.s\n"
- "sqadd z22.s, z22.s, z0.s\n"
- "sqadd z10.s, z10.s, z16.s\n"
- "and z16.d, z24.d, z19.d\n"
- ".inst 0x4482928d // srshl z13.s, p4/M, z13.s, z20.s\n"
- ".inst 0x4482926f // srshl z15.s, p4/M, z15.s, z19.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44829299 // srshl z25.s, p4/M, z25.s, z20.s\n"
- "add z13.s, z13.s, z12.s\n"
- "add z15.s, z15.s, z12.s\n"
- "sqadd z24.s, z24.s, z16.s\n"
- "add z25.s, z25.s, z12.s\n"
- "smin z13.s, p4/M, z13.s, z17.s\n"
- "smin z15.s, p4/M, z15.s, z17.s\n"
- "smin z25.s, p4/M, z25.s, z17.s\n"
- ".inst 0x44829269 // srshl z9.s, p4/M, z9.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z14.s\n"
- "smax z15.s, p4/M, z15.s, z14.s\n"
- "smax z25.s, p4/M, z25.s, z14.s\n"
- "add z9.s, z9.s, z12.s\n"
- ".inst 0x44829297 // srshl z23.s, p4/M, z23.s, z20.s\n"
- "trn1 z13.h, z13.h, z15.h\n"
- "st1b { z13.h }, p0, [x11, x15]\n"
- "smin z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829276 // srshl z22.s, p4/M, z22.s, z19.s\n"
- "add z23.s, z23.s, z12.s\n"
- ".inst 0x4482928a // srshl z10.s, p4/M, z10.s, z20.s\n"
- ".inst 0x44829278 // srshl z24.s, p4/M, z24.s, z19.s\n"
- "add z22.s, z22.s, z12.s\n"
- "smax z9.s, p4/M, z9.s, z14.s\n"
- "add z10.s, z10.s, z12.s\n"
- "add z24.s, z24.s, z12.s\n"
- "smin z23.s, p4/M, z23.s, z17.s\n"
- "trn1 z25.h, z25.h, z9.h\n"
- "st1b { z25.h }, p0, [x10, x15]\n"
- "smin z22.s, p4/M, z22.s, z17.s\n"
- "smin z10.s, p4/M, z10.s, z17.s\n"
- "smax z23.s, p4/M, z23.s, z14.s\n"
- "smin z24.s, p4/M, z24.s, z17.s\n"
- "smax z22.s, p4/M, z22.s, z14.s\n"
- "smax z10.s, p4/M, z10.s, z14.s\n"
- "smax z24.s, p4/M, z24.s, z14.s\n"
- "trn1 z23.h, z23.h, z22.h\n"
- "st1b { z23.h }, p0, [x9, x15]\n"
- "trn1 z10.h, z10.h, z24.h\n"
- "st1b { z10.h }, p0, [x28, x15]\n"
- "inch x15\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z10.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z10.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z15.s, z10.s, z16.s\n"
+ "and z22.d, z16.d, z27.d\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z20.d, z9.d, z27.d\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
+ "sqadd z25.s, z25.s, z21.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ "sqadd z10.s, z10.s, z2.s\n"
+ "sqadd z16.s, z16.s, z22.s\n"
+ ".inst 0x4482936a // srshl z10.s, p4/M, z10.s, z27.s\n"
+ ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
+ "sqadd z9.s, z9.s, z20.s\n"
+ ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
+ ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
+ ".inst 0x4530435a // sqxtnb z26.h, z26.s\n"
+ ".inst 0x45304318 // sqxtnb z24.h, z24.s\n"
+ ".inst 0x45304339 // sqxtnb z25.h, z25.s\n"
+ ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
+ ".inst 0x4530455a // sqxtnt z26.h, z10.s\n"
+ ".inst 0x45304618 // sqxtnt z24.h, z16.s\n"
+ ".inst 0x45304539 // sqxtnt z25.h, z9.s\n"
+ "sqadd z13.h, z13.h, z14.h\n"
+ "smax z13.h, p4/M, z13.h, z12.h\n"
+ "smin z13.h, p4/M, z13.h, z11.h\n"
+ "sqadd z26.h, z26.h, z14.h\n"
+ "sqadd z24.h, z24.h, z14.h\n"
+ "smax z26.h, p4/M, z26.h, z12.h\n"
+ "smax z24.h, p4/M, z24.h, z12.h\n"
+ "sqadd z25.h, z25.h, z14.h\n"
+ "smax z25.h, p4/M, z25.h, z12.h\n"
+ "smin z26.h, p4/M, z26.h, z11.h\n"
+ "st1b { z13.h }, p0, [x15, x10]\n"
+ "smin z24.h, p4/M, z24.h, z11.h\n"
+ "smin z25.h, p4/M, z25.h, z11.h\n"
+ "st1b { z26.h }, p0, [x14, x10]\n"
+ "st1b { z24.h }, p0, [x13, x10]\n"
+ "st1b { z25.h }, p0, [x12, x10]\n"
+ "ld1sb { z0.h }, p4/Z, [x16]\n"
+ "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+ ".inst 0x454f1000 // ssublb z0.h, z0.b, z15.b\n"
+ ".inst 0x454f1021 // ssublb z1.h, z1.b, z15.b\n"
+ "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+ ".inst 0x454f1042 // ssublb z2.h, z2.b, z15.b\n"
+ ".inst 0x454f1063 // ssublb z3.h, z3.b, z15.b\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454f1084 // ssublb z4.h, z4.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z13.s, z17.s, z16.s\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x16]\n"
+ "ldp x24, x23, [x11, #0x0]\n"
+ "addvl x26, x26, #2\n"
+ "str x26, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x11, #0x10]\n"
+ "ldr x20, [x11, #0x20]\n"
+ "mov z26.d, z13.d\n"
+ "mov z10.d, z17.d\n"
+ "ld1b { z31.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z30.h }, p3/Z, [x23, x8]\n"
+ "mov z24.d, z13.d\n"
+ "mov z16.d, z17.d\n"
+ "ld1b { z29.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z28.h }, p3/Z, [x21, x8]\n"
"mov z25.d, z13.d\n"
- "ld1sb { z0.h }, p4/Z, [x17]\n"
- "mov z23.d, z13.d\n"
- "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
- "mov z9.d, z15.d\n"
- "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
- "mov z22.d, z15.d\n"
- "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
- "mov z10.d, z13.d\n"
- "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
- "mov z24.d, z15.d\n"
- "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x455a1000 // ssublb z0.h, z0.b, z26.b\n"
- "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
- ".inst 0x455a1021 // ssublb z1.h, z1.b, z26.b\n"
- "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x455a1042 // ssublb z2.h, z2.b, z26.b\n"
- "ld1sb { z8.h }, p4/Z, [x17]\n"
- ".inst 0x455a1063 // ssublb z3.h, z3.b, z26.b\n"
- "ldp x23, x22, [x13, #0x0]\n"
- ".inst 0x455a1084 // ssublb z4.h, z4.b, z26.b\n"
- "ldp x21, x20, [x13, #0x10]\n"
- ".inst 0x455a10a5 // ssublb z5.h, z5.b, z26.b\n"
- ".inst 0x455a10c6 // ssublb z6.h, z6.b, z26.b\n"
- "ldr x19, [x13, #0x20]\n"
- ".inst 0x455a10e7 // ssublb z7.h, z7.b, z26.b\n"
- ".inst 0x455a1108 // ssublb z8.h, z8.b, z26.b\n"
- "ld1b { z31.h }, p3/Z, [x23, x16]\n"
- "ld1b { z30.h }, p3/Z, [x22, x16]\n"
- ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
- "ld1b { z29.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
- "ld1b { z28.h }, p3/Z, [x20, x16]\n"
- "ld1b { z27.h }, p3/Z, [x19, x16]\n"
- ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
- ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
- ".inst 0x454b1b7b // usublb z27.h, z27.b, z11.b\n"
+ "mov z9.d, z17.d\n"
+ "ld1b { z27.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ ".inst 0x454f10e7 // ssublb z7.h, z7.b, z15.b\n"
+ ".inst 0x454f1108 // ssublb z8.h, z8.b, z15.b\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 9adf100a0f..24c4bf713d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,356 +100,348 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ptrue p4.b\n"
- "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x6, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
"mov x7, #0x0\n"
- "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x17, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z16.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z23.b }, p4/Z, [x21]\n"
"ld1rb { z12.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z14.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z17.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x6, x4\n"
- "ld1rw { z15.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x6, x4\n"
- "ldp x15, x14, [x21, #0x0]\n"
- "mov x19, x6\n"
- "incw x19\n"
- "ldp x13, x12, [x21, #0x10]\n"
- "whilelt p1.s, x19, x4\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z20.s }, p2/Z, [x19]\n"
- "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z20.s, z10.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z20.s, z20.s, z10.s\n"
- "mov z11.d, z13.d\n"
- "ld1sb { z0.h }, p4/Z, [x5]\n"
- "mov z9.d, z13.d\n"
- "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n"
- "mov z18.d, z20.d\n"
- "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n"
- "mov z19.d, z20.d\n"
- "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n"
- "mov z23.d, z13.d\n"
- "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n"
- "mov z21.d, z20.d\n"
- "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z14.h }, p4/Z, [x22]\n"
+ "ld1rh { z16.h }, p4/Z, [x21]\n"
+ "ld1rh { z15.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z0.h }, p4/Z, [x17]\n"
+ "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n"
".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n"
- "inch x5, ALL, MUL #8\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- "ld1sb { z8.h }, p4/Z, [x5]\n"
".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ldp x26, x25, [x17, #0x0]\n"
+ "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ldp x24, x23, [x17, #0x10]\n"
+ "ld1w { z18.s }, p2/Z, [x12]\n"
+ "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z8.s\n"
+ "uzp2 z17.s, z18.s, z8.s\n"
+ "ld1sb { z8.h }, p4/Z, [x17]\n"
+ "ldp x9, x28, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z9.d, z13.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z10.d, z17.d\n"
+ "mov z11.d, z13.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x9, x7]\n"
+ "mov z22.d, z17.d\n"
+ "mov z21.d, z13.d\n"
+ "ld1b { z30.h }, p3/Z, [x28, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ "mov z18.d, z17.d\n"
".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ldp x22, x21, [x17, #0x20]\n"
".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
+ "ld1b { z26.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ldp x20, x19, [x17, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x26, x6]\n"
- ".inst 0x45501bff // usublb z31.h, z31.b, z16.b\n"
- "ld1b { z30.h }, p3/Z, [x25, x6]\n"
- "ld1b { z29.h }, p3/Z, [x24, x6]\n"
- ".inst 0x45501bde // usublb z30.h, z30.b, z16.b\n"
- "ld1b { z28.h }, p3/Z, [x23, x6]\n"
- ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
- "ld1b { z27.h }, p3/Z, [x22, x6]\n"
- "ld1b { z26.h }, p3/Z, [x21, x6]\n"
- ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
- "ld1b { z25.h }, p3/Z, [x20, x6]\n"
- "ld1b { z24.h }, p3/Z, [x19, x6]\n"
- ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
- ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
- ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
- ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ "ld1b { z24.h }, p3/Z, [x20, x7]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
"1:" // Loop
".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
- "ldr x22, [x17, #0x40]\n"
- "whilelt p0.h, x7, x4\n"
- ".inst 0x448847f4 // smlalt z20.s, p4/M, z31.h, z8.h\n"
- "ldr x21, [x17, #0x48]\n"
- "inch x5\n"
- ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
- "ldr x20, [x17, #0x50]\n"
- ".inst 0x448647f2 // smlalt z18.s, p4/M, z31.h, z6.h\n"
- "ldr x19, [x17, #0x58]\n"
- ".inst 0x448243e9 // smlalb z9.s, p4/M, z31.h, z2.h\n"
- "ldr x11, [x17, #0x60]\n"
- ".inst 0x448247f3 // smlalt z19.s, p4/M, z31.h, z2.h\n"
- "ldr x10, [x17, #0x68]\n"
- ".inst 0x448043f7 // smlalb z23.s, p4/M, z31.h, z0.h\n"
- "ldr x9, [x17, #0x70]\n"
- ".inst 0x448047f5 // smlalt z21.s, p4/M, z31.h, z0.h\n"
- "ldr x28, [x17, #0x78]\n"
+ ".inst 0x448847f1 // smlalt z17.s, p4/M, z31.h, z8.h\n"
+ "ldr x25, [x11, #0x40]\n"
+ "ldr x24, [x11, #0x48]\n"
+ ".inst 0x448643e9 // smlalb z9.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x448647ea // smlalt z10.s, p4/M, z31.h, z6.h\n"
+ "ldr x22, [x11, #0x50]\n"
+ "ldr x20, [x11, #0x58]\n"
".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
- "ldr x27, [x17, #0x80]\n"
- ".inst 0x448047d4 // smlalt z20.s, p4/M, z30.h, z0.h\n"
- "ldr x26, [x17, #0x88]\n"
- ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
- "ldr x25, [x17, #0x90]\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x21, x6]\n"
+ ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
+ "ldr x23, [x11, #0x78]\n"
+ "ldr x21, [x11, #0x60]\n"
+ ".inst 0x44814389 // smlalb z9.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
- "ldr x24, [x17, #0x98]\n"
- ".inst 0x448147b4 // smlalt z20.s, p4/M, z29.h, z1.h\n"
- "ld1b { z29.h }, p3/Z, [x22, x6]\n"
- ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
- "ldr x23, [x17, #0xa0]\n"
- ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
- "ldr x22, [x17, #0xa8]\n"
- ".inst 0x44824772 // smlalt z18.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x6]\n"
- ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
- "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x448147b1 // smlalt z17.s, p4/M, z29.h, z1.h\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
- "ldr x20, [x17, #0xb8]\n"
- ".inst 0x44834754 // smlalt z20.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x19, x6]\n"
- ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
- "ldr x19, [x17, #0xc0]\n"
- ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n"
- "ld1w { z10.s }, p2/Z, [x8]\n"
+ ".inst 0x44834751 // smlalt z17.s, p4/M, z26.h, z3.h\n"
+ "ld1b { z26.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x44804309 // smlalb z9.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x4480470a // smlalt z10.s, p4/M, z24.h, z0.h\n"
+ "ldr x22, [x11, #0x80]\n"
+ "ldr x20, [x11, #0x68]\n"
".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
- "ld1w { z22.s }, p1/Z, [x8, #1, MUL VL]\n"
- "addvl x8, x8, #2\n"
- ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
- ".inst 0x44844734 // smlalt z20.s, p4/M, z25.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x11, x6]\n"
- ".inst 0x44804712 // smlalt z18.s, p4/M, z24.h, z0.h\n"
- "uzp1 z31.s, z10.s, z22.s\n"
- "uzp2 z30.s, z10.s, z22.s\n"
- "ld1w { z10.s }, p2/Z, [x16]\n"
- ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
- "ld1w { z22.s }, p1/Z, [x16, #1, MUL VL]\n"
- "addvl x16, x16, #2\n"
+ ".inst 0x44844731 // smlalt z17.s, p4/M, z25.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
+ "ldr x21, [x11, #0x88]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824714 // smlalt z20.s, p4/M, z24.h, z2.h\n"
- "ld1b { z24.h }, p3/Z, [x9, x6]\n"
- ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
- ".inst 0x448447b2 // smlalt z18.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x10, x6]\n"
- ".inst 0x44834349 // smlalb z9.s, p4/M, z26.h, z3.h\n"
- ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
- ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
- ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
- ".inst 0x44854792 // smlalt z18.s, p4/M, z28.h, z5.h\n"
- "ld1b { z28.h }, p3/Z, [x27, x6]\n"
+ ".inst 0x44824711 // smlalt z17.s, p4/M, z24.h, z2.h\n"
+ "ldr x20, [x11, #0x70]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4485478a // smlalt z10.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f6 // smlalt z22.s, p4/M, z31.h, z2.h\n"
+ "ldr x25, [x11, #0x98]\n"
+ "ld1b { z24.h }, p3/Z, [x20, x7]\n"
".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
- ".inst 0x44854774 // smlalt z20.s, p4/M, z27.h, z5.h\n"
- ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
- ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
- ".inst 0x44834772 // smlalt z18.s, p4/M, z27.h, z3.h\n"
- "ld1b { z27.h }, p3/Z, [x28, x6]\n"
- ".inst 0x44834753 // smlalt z19.s, p4/M, z26.h, z3.h\n"
- "ld1b { z26.h }, p3/Z, [x26, x6]\n"
+ ".inst 0x44854771 // smlalt z17.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ "ldr x24, [x11, #0x90]\n"
+ ".inst 0x44834369 // smlalb z9.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x448043f5 // smlalb z21.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x4483434b // smlalb z11.s, p4/M, z26.h, z3.h\n"
+ "ldr x23, [x11, #0xa8]\n"
+ "ldr x20, [x11, #0xa0]\n"
+ ".inst 0x44834756 // smlalt z22.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4480432b // smlalb z11.s, p4/M, z25.h, z0.h\n"
+ "ldr x22, [x11, #0xb0]\n"
+ "ldr x21, [x11, #0xb8]\n"
+ ".inst 0x44804736 // smlalt z22.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44844772 // smlalt z18.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x44814395 // smlalb z21.s, p4/M, z28.h, z1.h\n"
".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44864734 // smlalt z20.s, p4/M, z25.h, z6.h\n"
- ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
- ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
- ".inst 0x44804329 // smlalb z9.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804733 // smlalt z19.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x25, x6]\n"
- "uzp1 z0.s, z10.s, z22.s\n"
- "uzp2 z22.s, z10.s, z22.s\n"
- ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
- ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
- ".inst 0x448447b3 // smlalt z19.s, p4/M, z29.h, z4.h\n"
- "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z31.s }, p2/Z, [x27]\n"
+ ".inst 0x44864731 // smlalt z17.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x448447b6 // smlalt z22.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
- ".inst 0x44874714 // smlalt z20.s, p4/M, z24.h, z7.h\n"
- ".inst 0x44814309 // smlalb z9.s, p4/M, z24.h, z1.h\n"
- ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
- ".inst 0x04bf75ad // sqrdmulh z13.s, z13.s, z31.s\n"
- ".inst 0x04be7694 // sqrdmulh z20.s, z20.s, z30.s\n"
- ".inst 0x44814713 // smlalt z19.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x22, x6]\n"
- ".inst 0x44844377 // smlalb z23.s, p4/M, z27.h, z4.h\n"
- "and z10.d, z13.d, z0.d\n"
- ".inst 0x44844775 // smlalt z21.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x6]\n"
- ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
- "asr z10.s, z10.s, #0x1f\n"
- "and z4.d, z20.d, z22.d\n"
- ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
- "sqadd z13.s, z13.s, z10.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n"
- ".inst 0x44874792 // smlalt z18.s, p4/M, z28.h, z7.h\n"
- "sqadd z20.s, z20.s, z4.s\n"
- ".inst 0x44814397 // smlalb z23.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814795 // smlalt z21.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44864329 // smlalb z9.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44864733 // smlalt z19.s, p4/M, z25.h, z6.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x6]\n"
- ".inst 0x44854357 // smlalb z23.s, p4/M, z26.h, z5.h\n"
- ".inst 0x44854755 // smlalt z21.s, p4/M, z26.h, z5.h\n"
- "ld1b { z26.h }, p3/Z, [x21, x6]\n"
- ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n"
- ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
+ "ld1w { z20.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z19.s, z31.s, z20.s\n"
+ ".inst 0x44874711 // smlalt z17.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x4481430b // smlalb z11.s, p4/M, z24.h, z1.h\n"
+ "uzp2 z30.s, z31.s, z20.s\n"
+ "ld1w { z31.s }, p2/Z, [x26]\n"
+ ".inst 0x44814716 // smlalt z22.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x44854752 // smlalt z18.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
+ ".inst 0x448243b5 // smlalb z21.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z26.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x448247b2 // smlalt z18.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x4486432b // smlalb z11.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
+ "ld1w { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "uzp1 z1.s, z31.s, z20.s\n"
+ ".inst 0x44874389 // smlalb z9.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x4487478a // smlalt z10.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x44864736 // smlalt z22.s, p4/M, z25.h, z6.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x44834712 // smlalt z18.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44874355 // smlalb z21.s, p4/M, z26.h, z7.h\n"
+ "uzp2 z31.s, z31.s, z20.s\n"
+ "inch x17\n"
+ ".inst 0x448843a9 // smlalb z9.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847aa // smlalt z10.s, p4/M, z29.h, z8.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x44874776 // smlalt z22.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44874752 // smlalt z18.s, p4/M, z26.h, z7.h\n"
+ "and z0.d, z13.d, z1.d\n"
+ "inch x7\n"
+ ".inst 0x4485430b // smlalb z11.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44864335 // smlalb z21.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x04be7631 // sqrdmulh z17.s, z17.s, z30.s\n"
+ "mov x20, x7\n"
+ ".inst 0x44854716 // smlalt z22.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44864732 // smlalt z18.s, p4/M, z25.h, z6.h\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "incw x20\n"
+ ".inst 0x4488432b // smlalb z11.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x448843b5 // smlalb z21.s, p4/M, z29.h, z8.h\n"
+ "and z20.d, z17.d, z31.d\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x44884736 // smlalt z22.s, p4/M, z25.h, z8.h\n"
".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
- ".inst 0x04bf756b // sqrdmulh z11.s, z11.s, z31.s\n"
- ".inst 0x448243b7 // smlalb z23.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "whilelt p3.h, x7, x8\n"
+ "sqadd z13.s, z13.s, z0.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x4482902d // srshl z13.s, p4/M, z13.s, z1.s\n"
+ "addvl x27, x27, #2\n"
+ "and z19.d, z9.d, z1.d\n"
+ ".inst 0x04be754a // sqrdmulh z10.s, z10.s, z30.s\n"
+ "addvl x26, x26, #2\n"
+ "and z2.d, z11.d, z1.d\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "and z0.d, z21.d, z1.d\n"
".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
- ".inst 0x448247b5 // smlalt z21.s, p4/M, z29.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x19, x6]\n"
- "inch x6\n"
- "and z2.d, z11.d, z0.d\n"
- "whilelt p2.s, x6, x4\n"
- ".inst 0x44874369 // smlalb z9.s, p4/M, z27.h, z7.h\n"
- "mov x19, x6\n"
- "and z10.d, z18.d, z22.d\n"
- "incw x19\n"
- ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
- "whilelt p1.s, x19, x4\n"
+ "sqadd z17.s, z17.s, z20.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448293f1 // srshl z17.s, p4/M, z17.s, z31.s\n"
+ "and z3.d, z10.d, z31.d\n"
"asr z2.s, z2.s, #0x1f\n"
- "whilelt p3.h, x6, x4\n"
- "asr z10.s, z10.s, #0x1f\n"
- ".inst 0x44874773 // smlalt z19.s, p4/M, z27.h, z7.h\n"
+ "and z26.d, z22.d, z31.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z20.d, z18.d, z31.d\n"
+ "sqadd z9.s, z9.s, z19.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
"sqadd z11.s, z11.s, z2.s\n"
- "sqadd z18.s, z18.s, z10.s\n"
- ".inst 0x44854309 // smlalb z9.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44854713 // smlalt z19.s, p4/M, z24.h, z5.h\n"
- ".inst 0x44834317 // smlalb z23.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834715 // smlalt z21.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44884329 // smlalb z9.s, p4/M, z25.h, z8.h\n"
- ".inst 0x44884733 // smlalt z19.s, p4/M, z25.h, z8.h\n"
- ".inst 0x44874357 // smlalb z23.s, p4/M, z26.h, z7.h\n"
- ".inst 0x44874755 // smlalt z21.s, p4/M, z26.h, z7.h\n"
- ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n"
- ".inst 0x04be7673 // sqrdmulh z19.s, z19.s, z30.s\n"
- ".inst 0x44864337 // smlalb z23.s, p4/M, z25.h, z6.h\n"
- ".inst 0x44864735 // smlalt z21.s, p4/M, z25.h, z6.h\n"
- "and z10.d, z9.d, z0.d\n"
- "and z24.d, z19.d, z22.d\n"
- ".inst 0x448843b7 // smlalb z23.s, p4/M, z29.h, z8.h\n"
- "asr z10.s, z10.s, #0x1f\n"
- "asr z24.s, z24.s, #0x1f\n"
- ".inst 0x448847b5 // smlalt z21.s, p4/M, z29.h, z8.h\n"
- "sqadd z9.s, z9.s, z10.s\n"
- "sqadd z19.s, z19.s, z24.s\n"
- ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n"
- ".inst 0x04be76b5 // sqrdmulh z21.s, z21.s, z30.s\n"
- ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n"
- ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
- "and z30.d, z23.d, z0.d\n"
- "and z28.d, z21.d, z22.d\n"
- "add z13.s, z13.s, z14.s\n"
- "add z20.s, z20.s, z14.s\n"
- "asr z30.s, z30.s, #0x1f\n"
- "asr z28.s, z28.s, #0x1f\n"
- "smin z13.s, p4/M, z13.s, z15.s\n"
- "sqadd z23.s, z23.s, z30.s\n"
- "sqadd z21.s, z21.s, z28.s\n"
- "smin z20.s, p4/M, z20.s, z15.s\n"
- "smax z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n"
- ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
- "smax z20.s, p4/M, z20.s, z17.s\n"
- ".inst 0x44829009 // srshl z9.s, p4/M, z9.s, z0.s\n"
- "add z11.s, z11.s, z14.s\n"
- "add z18.s, z18.s, z14.s\n"
- "trn1 z13.h, z13.h, z20.h\n"
- "st1b { z13.h }, p0, [x15, x7]\n"
- "add z9.s, z9.s, z14.s\n"
- "smin z11.s, p4/M, z11.s, z15.s\n"
- "smin z18.s, p4/M, z18.s, z15.s\n"
- ".inst 0x448292d3 // srshl z19.s, p4/M, z19.s, z22.s\n"
- "smin z9.s, p4/M, z9.s, z15.s\n"
- "smax z11.s, p4/M, z11.s, z17.s\n"
- "smax z18.s, p4/M, z18.s, z17.s\n"
- "add z19.s, z19.s, z14.s\n"
- "smax z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829017 // srshl z23.s, p4/M, z23.s, z0.s\n"
- "trn1 z11.h, z11.h, z18.h\n"
- "st1b { z11.h }, p0, [x14, x7]\n"
- "smin z19.s, p4/M, z19.s, z15.s\n"
- ".inst 0x448292d5 // srshl z21.s, p4/M, z21.s, z22.s\n"
- "add z23.s, z23.s, z14.s\n"
- "add z21.s, z21.s, z14.s\n"
- "smax z19.s, p4/M, z19.s, z17.s\n"
- "smin z23.s, p4/M, z23.s, z15.s\n"
- "smin z21.s, p4/M, z21.s, z15.s\n"
- "trn1 z9.h, z9.h, z19.h\n"
- "st1b { z9.h }, p0, [x13, x7]\n"
- "smax z23.s, p4/M, z23.s, z17.s\n"
- "smax z21.s, p4/M, z21.s, z17.s\n"
- "trn1 z23.h, z23.h, z21.h\n"
- "st1b { z23.h }, p0, [x12, x7]\n"
- "inch x7\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z20.s }, p2/Z, [x19]\n"
- "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z13.s, z20.s, z10.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z20.s, z20.s, z10.s\n"
- "mov z11.d, z13.d\n"
- "ld1sb { z0.h }, p4/Z, [x5]\n"
- "mov z9.d, z13.d\n"
- "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n"
- "mov z18.d, z20.d\n"
- "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n"
- "mov z19.d, z20.d\n"
- "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n"
- "mov z23.d, z13.d\n"
- "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n"
- "mov z21.d, z20.d\n"
- "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n"
+ ".inst 0x4482902b // srshl z11.s, p4/M, z11.s, z1.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z0.s\n"
+ ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ ".inst 0x448293ea // srshl z10.s, p4/M, z10.s, z31.s\n"
+ "sqadd z22.s, z22.s, z26.s\n"
+ "sqadd z18.s, z18.s, z20.s\n"
+ ".inst 0x448293f6 // srshl z22.s, p4/M, z22.s, z31.s\n"
+ ".inst 0x448293f2 // srshl z18.s, p4/M, z18.s, z31.s\n"
+ ".inst 0x453041ad // sqxtnb z13.h, z13.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x4530416b // sqxtnb z11.h, z11.s\n"
+ ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
+ ".inst 0x4530462d // sqxtnt z13.h, z17.s\n"
+ ".inst 0x45304549 // sqxtnt z9.h, z10.s\n"
+ ".inst 0x453046cb // sqxtnt z11.h, z22.s\n"
+ ".inst 0x45304655 // sqxtnt z21.h, z18.s\n"
+ "sqadd z13.h, z13.h, z14.h\n"
+ "sqadd z9.h, z9.h, z14.h\n"
+ "smax z13.h, p4/M, z13.h, z16.h\n"
+ "smax z9.h, p4/M, z9.h, z16.h\n"
+ "sqadd z11.h, z11.h, z14.h\n"
+ "sqadd z21.h, z21.h, z14.h\n"
+ "smax z11.h, p4/M, z11.h, z16.h\n"
+ "smax z21.h, p4/M, z21.h, z16.h\n"
+ "smin z13.h, p4/M, z13.h, z15.h\n"
+ "smin z9.h, p4/M, z9.h, z15.h\n"
+ "st1b { z13.h }, p0, [x16, x10]\n"
+ "smin z11.h, p4/M, z11.h, z15.h\n"
+ "smin z21.h, p4/M, z21.h, z15.h\n"
+ "st1b { z9.h }, p0, [x15, x10]\n"
+ "st1b { z11.h }, p0, [x14, x10]\n"
+ "st1b { z21.h }, p0, [x13, x10]\n"
+ "ld1sb { z0.h }, p4/Z, [x17]\n"
+ "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n"
".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
- "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n"
- "inch x5, ALL, MUL #8\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
- "ld1sb { z8.h }, p4/Z, [x5]\n"
".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ldp x26, x25, [x17, #0x0]\n"
+ "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- "ldp x24, x23, [x17, #0x10]\n"
+ "ld1w { z18.s }, p2/Z, [x12]\n"
+ "ld1w { z8.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z8.s\n"
+ "uzp2 z17.s, z18.s, z8.s\n"
+ "ld1sb { z8.h }, p4/Z, [x17]\n"
+ "ldp x9, x28, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z9.d, z13.d\n"
+ "mov z10.d, z17.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x9, x7]\n"
+ "mov z11.d, z13.d\n"
+ "mov z22.d, z17.d\n"
+ "ld1b { z30.h }, p3/Z, [x28, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x25, x7]\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z17.d\n"
+ "ld1b { z28.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
- "ldp x22, x21, [x17, #0x20]\n"
+ "ld1b { z26.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
- "ldp x20, x19, [x17, #0x30]\n"
- "ld1b { z31.h }, p3/Z, [x26, x6]\n"
- ".inst 0x45501bff // usublb z31.h, z31.b, z16.b\n"
- "ld1b { z30.h }, p3/Z, [x25, x6]\n"
- "ld1b { z29.h }, p3/Z, [x24, x6]\n"
- ".inst 0x45501bde // usublb z30.h, z30.b, z16.b\n"
- "ld1b { z28.h }, p3/Z, [x23, x6]\n"
- ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
- "ld1b { z27.h }, p3/Z, [x22, x6]\n"
- "ld1b { z26.h }, p3/Z, [x21, x6]\n"
- ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
- "ld1b { z25.h }, p3/Z, [x20, x6]\n"
- "ld1b { z24.h }, p3/Z, [x19, x6]\n"
- ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
- ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
- ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
- ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
+ "ld1b { z24.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45571bff // usublb z31.h, z31.b, z23.b\n"
+ ".inst 0x45571bde // usublb z30.h, z30.b, z23.b\n"
+ ".inst 0x45571bbd // usublb z29.h, z29.b, z23.b\n"
+ ".inst 0x45571b9c // usublb z28.h, z28.b, z23.b\n"
+ ".inst 0x45571b7b // usublb z27.h, z27.b, z23.b\n"
+ ".inst 0x45571b5a // usublb z26.h, z26.b, z23.b\n"
+ ".inst 0x45571b39 // usublb z25.h, z25.b, z23.b\n"
+ ".inst 0x45571b18 // usublb z24.h, z24.b, z23.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 9cf95e9588..9c291ae186 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,546 +111,538 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x0, #0x0\n"
+ "mov x24, x0\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ptrue p4.b\n"
- "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x2, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "mov x3, #0x0\n"
- "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "add x5, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z9.b }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z14.b }, p4/Z, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1rw { z17.s }, p4/Z, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1rw { z12.s }, p4/Z, [x20]\n"
- "whilelt p3.h, x2, x0\n"
- "ld1rw { z11.s }, p4/Z, [x19]\n"
- "whilelt p2.s, x2, x0\n"
- "ldp x7, x8, [x21, #0x0]\n"
- "mov x19, x2\n"
- "incw x19\n"
- "ldp x17, x16, [x21, #0x10]\n"
- "whilelt p1.s, x19, x0\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z4.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z15.s, z4.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z18.s, z4.s, z16.s\n"
- "mov z21.d, z15.d\n"
- "ld1sb { z0.h }, p4/Z, [x1]\n"
- "mov z5.d, z15.d\n"
- "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
- "mov z13.d, z18.d\n"
- "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
- "mov z7.d, z18.d\n"
- "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
- "mov z6.d, z15.d\n"
- "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
- "mov z8.d, z18.d\n"
- "ldp x28, x27, [x5, #0x0]\n"
- ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
- "ldp x26, x25, [x5, #0x10]\n"
- ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
- ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
- "ldp x24, x23, [x5, #0x20]\n"
- ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
- ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
- "ldp x22, x21, [x5, #0x30]\n"
- "ldp x20, x19, [x5, #0x40]\n"
- "ld1b { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
- "ld1b { z30.h }, p3/Z, [x27, x2]\n"
- "ld1b { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
- "ld1b { z28.h }, p3/Z, [x25, x2]\n"
- ".inst 0x45491bbd // usublb z29.h, z29.b, z9.b\n"
- "ld1b { z27.h }, p3/Z, [x24, x2]\n"
- "ld1b { z23.h }, p3/Z, [x23, x2]\n"
- ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
- "ld1b { z25.h }, p3/Z, [x22, x2]\n"
- "ld1b { z24.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
- ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
- "ld1b { z26.h }, p3/Z, [x20, x2]\n"
- "ld1b { z22.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
- ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
- ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
- ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x2, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z15.b }, p4/Z, [x21]\n"
+ "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z11.h }, p4/Z, [x20]\n"
+ "ldp x3, x4, [x22, #0x0]\n"
+ "whilelt p3.h, x0, x1\n"
+ "ldp x5, x6, [x22, #0x10]\n"
+ "whilelt p2.s, x0, x1\n"
+ "whilelt p1.s, x24, x1\n"
+ "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x7, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z30.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z14.s, z30.s, z16.s\n"
+ "ld1sb { z0.h }, p4/Z, [x2]\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
+ "uzp2 z10.s, z30.s, z16.s\n"
+ "addvl x14, x14, #2\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
+ "mov x8, #0x0\n"
+ "mov z20.d, z14.d\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
+ "ldp x9, x28, [x7, #0x0]\n"
+ "mov z7.d, z10.d\n"
+ "mov z8.d, z14.d\n"
+ "ldp x27, x26, [x7, #0x10]\n"
+ "ldp x25, x24, [x7, #0x20]\n"
+ "mov z16.d, z10.d\n"
+ "mov z6.d, z14.d\n"
+ "ldp x23, x22, [x7, #0x30]\n"
+ "ldp x21, x20, [x7, #0x40]\n"
+ "mov z5.d, z10.d\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ "ld1b { z31.h }, p3/Z, [x9, x0]\n"
+ "ld1b { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ "ld1b { z29.h }, p3/Z, [x27, x0]\n"
+ "ld1b { z28.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ "ld1b { z27.h }, p3/Z, [x25, x0]\n"
+ "ld1b { z23.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ "ld1b { z25.h }, p3/Z, [x23, x0]\n"
+ "ld1b { z24.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ "ld1b { z26.h }, p3/Z, [x21, x0]\n"
+ "ld1b { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x14, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
+ ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
"1:" // Loop
- ".inst 0x448043ef // smlalb z15.s, p4/M, z31.h, z0.h\n"
- "ldr x20, [x5, #0x50]\n"
- "whilelt p0.h, x3, x0\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ldr x19, [x5, #0x58]\n"
- ".inst 0x448043d5 // smlalb z21.s, p4/M, z30.h, z0.h\n"
- "ldr x25, [x5, #0x60]\n"
- ".inst 0x448047cd // smlalt z13.s, p4/M, z30.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x448043a5 // smlalb z5.s, p4/M, z29.h, z0.h\n"
- "ldr x24, [x5, #0x68]\n"
- ".inst 0x448047a7 // smlalt z7.s, p4/M, z29.h, z0.h\n"
- "ldr x23, [x5, #0x70]\n"
+ ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
+ "ldr x20, [x7, #0x50]\n"
+ "ld1b { z31.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
+ "ldr x22, [x7, #0x58]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ ".inst 0x448043a8 // smlalb z8.s, p4/M, z29.h, z0.h\n"
".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
- "ldr x22, [x5, #0x78]\n"
- ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
- "ldr x15, [x5, #0x80]\n"
- ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x448143cf // smlalb z15.s, p4/M, z30.h, z1.h\n"
- "ldr x21, [x5, #0x88]\n"
- ".inst 0x448147d2 // smlalt z18.s, p4/M, z30.h, z1.h\n"
- "ld1b { z30.h }, p3/Z, [x19, x2]\n"
- ".inst 0x44814375 // smlalb z21.s, p4/M, z27.h, z1.h\n"
- "ldr x20, [x5, #0x90]\n"
- ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
- "ldr x19, [x5, #0x98]\n"
- ".inst 0x4481476d // smlalt z13.s, p4/M, z27.h, z1.h\n"
- "ldr x14, [x5, #0xa0]\n"
- ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
- "ldr x13, [x5, #0xa8]\n"
- ".inst 0x44814385 // smlalb z5.s, p4/M, z28.h, z1.h\n"
- "ldr x12, [x5, #0xb0]\n"
- ".inst 0x44814787 // smlalt z7.s, p4/M, z28.h, z1.h\n"
- "ldr x11, [x5, #0xb8]\n"
+ "ldr x21, [x7, #0x60]\n"
+ "ldr x20, [x7, #0x68]\n"
+ ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z30.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x4482436e // smlalb z14.s, p4/M, z27.h, z2.h\n"
+ "ldr x25, [x7, #0x70]\n"
+ "ldr x24, [x7, #0x78]\n"
+ ".inst 0x44804785 // smlalt z5.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44814374 // smlalb z20.s, p4/M, z27.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x44814388 // smlalb z8.s, p4/M, z28.h, z1.h\n"
".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
- "ldr x10, [x5, #0xc0]\n"
- ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x4482436f // smlalb z15.s, p4/M, z27.h, z2.h\n"
- "ldr x9, [x5, #0xc8]\n"
- ".inst 0x44824772 // smlalt z18.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x25, x2]\n"
- ".inst 0x44824335 // smlalb z21.s, p4/M, z25.h, z2.h\n"
- "ldr x28, [x5, #0xd0]\n"
- ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
- "ldr x27, [x5, #0xd8]\n"
- ".inst 0x4482472d // smlalt z13.s, p4/M, z25.h, z2.h\n"
- "ldr x26, [x5, #0xe0]\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
- "ld1w { z19.s }, p2/Z, [x4]\n"
- ".inst 0x448242e5 // smlalb z5.s, p4/M, z23.h, z2.h\n"
- "ld1w { z16.s }, p1/Z, [x4, #1, MUL VL]\n"
- "addvl x4, x4, #2\n"
- ".inst 0x448246e7 // smlalt z7.s, p4/M, z23.h, z2.h\n"
+ "ldr x15, [x7, #0x80]\n"
+ "ldr x23, [x7, #0x88]\n"
+ ".inst 0x4482476a // smlalt z10.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44814767 // smlalt z7.s, p4/M, z27.h, z1.h\n"
+ "ld1b { z27.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
+ "ldr x22, [x7, #0x90]\n"
+ "ldr x21, [x7, #0x98]\n"
+ ".inst 0x448146e5 // smlalt z5.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x448242e8 // smlalb z8.s, p4/M, z23.h, z2.h\n"
".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- "uzp1 z10.s, z19.s, z16.s\n"
- "uzp2 z20.s, z19.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x6]\n"
- ".inst 0x4483432f // smlalb z15.s, p4/M, z25.h, z3.h\n"
- "ld1w { z16.s }, p1/Z, [x6, #1, MUL VL]\n"
- "addvl x6, x6, #2\n"
- ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
- ".inst 0x44834732 // smlalt z18.s, p4/M, z25.h, z3.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x2]\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- "ldr x25, [x5, #0xe8]\n"
- ".inst 0x4483470d // smlalt z13.s, p4/M, z24.h, z3.h\n"
- ".inst 0x448343e5 // smlalb z5.s, p4/M, z31.h, z3.h\n"
- ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
- ".inst 0x448347e7 // smlalt z7.s, p4/M, z31.h, z3.h\n"
+ "ldr x14, [x7, #0xa0]\n"
+ "ldr x13, [x7, #0xa8]\n"
+ ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
+ "ldr x12, [x7, #0xb0]\n"
+ "ldr x20, [x7, #0xb8]\n"
+ ".inst 0x448247e5 // smlalt z5.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x448343e8 // smlalb z8.s, p4/M, z31.h, z3.h\n"
".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1]\n"
- ".inst 0x4484430f // smlalb z15.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844712 // smlalt z18.s, p4/M, z24.h, z4.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x2]\n"
- ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
- "ldr x24, [x5, #0xf0]\n"
- ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
- ".inst 0x4484476d // smlalt z13.s, p4/M, z27.h, z4.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x2]\n"
- ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
- "ldr x23, [x5, #0xf8]\n"
- ".inst 0x448443c5 // smlalb z5.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448447c7 // smlalt z7.s, p4/M, z30.h, z4.h\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ "ldr x11, [x7, #0xc0]\n"
+ ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z24.h }, p3/Z, [x25, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448043ae // smlalb z14.s, p4/M, z29.h, z0.h\n"
+ "ldr x10, [x7, #0xc8]\n"
+ "ldr x9, [x7, #0xd0]\n"
+ ".inst 0x448347c5 // smlalt z5.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44844374 // smlalb z20.s, p4/M, z27.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x448443c8 // smlalb z8.s, p4/M, z30.h, z4.h\n"
".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
- ".inst 0x448043af // smlalb z15.s, p4/M, z29.h, z0.h\n"
- ".inst 0x448047b2 // smlalt z18.s, p4/M, z29.h, z0.h\n"
- "uzp1 z29.s, z19.s, z16.s\n"
- ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
- "uzp2 z19.s, z19.s, z16.s\n"
- ".inst 0x44804395 // smlalb z21.s, p4/M, z28.h, z0.h\n"
- ".inst 0x4480478d // smlalt z13.s, p4/M, z28.h, z0.h\n"
- ".inst 0x448042c5 // smlalb z5.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448046c7 // smlalt z7.s, p4/M, z22.h, z0.h\n"
+ "ldr x28, [x7, #0xd8]\n"
+ "ldr x27, [x7, #0xe0]\n"
+ ".inst 0x448047aa // smlalt z10.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44844767 // smlalt z7.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x4481438e // smlalb z14.s, p4/M, z28.h, z1.h\n"
+ "ldr x26, [x7, #0xe8]\n"
+ "ldr x25, [x7, #0xf0]\n"
+ ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44804394 // smlalb z20.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #1, MUL VL]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x448042c8 // smlalb z8.s, p4/M, z22.h, z0.h\n"
".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x4481438f // smlalb z15.s, p4/M, z28.h, z1.h\n"
- ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
- "ld1b { z28.h }, p3/Z, [x21, x2]\n"
- ".inst 0x448142f5 // smlalb z21.s, p4/M, z23.h, z1.h\n"
- "ldr x22, [x5, #0x100]\n"
- ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
- ".inst 0x448146ed // smlalt z13.s, p4/M, z23.h, z1.h\n"
- ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
- ".inst 0x44814325 // smlalb z5.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814727 // smlalt z7.s, p4/M, z25.h, z1.h\n"
+ "ld1w { z19.s }, p2/Z, [x17]\n"
+ "ld1w { z18.s }, p1/Z, [x17, #1, MUL VL]\n"
+ ".inst 0x4481478a // smlalt z10.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44804787 // smlalt z7.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z28.h }, p3/Z, [x23, x0]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448242ee // smlalb z14.s, p4/M, z23.h, z2.h\n"
+ "ldr x24, [x7, #0xf8]\n"
+ "uzp1 z9.s, z19.s, z18.s\n"
+ ".inst 0x44804725 // smlalt z5.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x448142f4 // smlalb z20.s, p4/M, z23.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #2, MUL VL]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x44814328 // smlalb z8.s, p4/M, z25.h, z1.h\n"
".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
- ".inst 0x448242ef // smlalb z15.s, p4/M, z23.h, z2.h\n"
- ".inst 0x448246f2 // smlalt z18.s, p4/M, z23.h, z2.h\n"
- "ld1b { z23.h }, p3/Z, [x15, x2]\n"
- ".inst 0x448243f5 // smlalb z21.s, p4/M, z31.h, z2.h\n"
- "ldr x21, [x5, #0x108]\n"
- ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
- ".inst 0x448247ed // smlalt z13.s, p4/M, z31.h, z2.h\n"
- ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
- ".inst 0x44824305 // smlalb z5.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824707 // smlalt z7.s, p4/M, z24.h, z2.h\n"
+ "uzp2 z29.s, z19.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16]\n"
+ ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
+ "ld1b { z23.h }, p3/Z, [x15, x0]\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
+ ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x448343ee // smlalb z14.s, p4/M, z31.h, z3.h\n"
+ "ldr x23, [x7, #0x100]\n"
+ "whilelt p0.h, x8, x1\n"
+ ".inst 0x44814705 // smlalt z5.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x448243f4 // smlalb z20.s, p4/M, z31.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #3, MUL VL]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x44824308 // smlalb z8.s, p4/M, z24.h, z2.h\n"
".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x448343ef // smlalb z15.s, p4/M, z31.h, z3.h\n"
- ".inst 0x448347f2 // smlalt z18.s, p4/M, z31.h, z3.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x448343d5 // smlalb z21.s, p4/M, z30.h, z3.h\n"
- "ldr x20, [x5, #0x110]\n"
- ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
- ".inst 0x448347cd // smlalt z13.s, p4/M, z30.h, z3.h\n"
- ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
- ".inst 0x44834365 // smlalb z5.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834767 // smlalt z7.s, p4/M, z27.h, z3.h\n"
+ "addvl x17, x17, #2\n"
+ ".inst 0x448347ea // smlalt z10.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
+ "ld1b { z31.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x448443ce // smlalb z14.s, p4/M, z30.h, z4.h\n"
+ "ldr x22, [x7, #0x108]\n"
+ ".inst 0x44824765 // smlalt z5.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448343d4 // smlalb z20.s, p4/M, z30.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #4, MUL VL]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
- ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x448443cf // smlalb z15.s, p4/M, z30.h, z4.h\n"
- ".inst 0x448447d2 // smlalt z18.s, p4/M, z30.h, z4.h\n"
- "ld1b { z30.h }, p3/Z, [x19, x2]\n"
- ".inst 0x44844355 // smlalb z21.s, p4/M, z26.h, z4.h\n"
- "ldr x19, [x5, #0x118]\n"
- ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
- ".inst 0x4484474d // smlalt z13.s, p4/M, z26.h, z4.h\n"
- "ld1b { z26.h }, p3/Z, [x14, x2]\n"
- ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
- ".inst 0x448442e5 // smlalb z5.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448446e7 // smlalt z7.s, p4/M, z23.h, z4.h\n"
- ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
+ ".inst 0x448447ca // smlalt z10.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
+ "ld1b { z30.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448042ce // smlalb z14.s, p4/M, z22.h, z0.h\n"
+ "ldr x21, [x7, #0x110]\n"
+ ".inst 0x448346e5 // smlalt z5.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x448442e8 // smlalb z8.s, p4/M, z23.h, z4.h\n"
".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x448042cf // smlalb z15.s, p4/M, z22.h, z0.h\n"
- ".inst 0x448046d2 // smlalt z18.s, p4/M, z22.h, z0.h\n"
- "ld1b { z22.h }, p3/Z, [x11, x2]\n"
- ".inst 0x44804335 // smlalb z21.s, p4/M, z25.h, z0.h\n"
- ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
- ".inst 0x4480472d // smlalt z13.s, p4/M, z25.h, z0.h\n"
- ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n"
- ".inst 0x448043e5 // smlalb z5.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047e7 // smlalt z7.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448046ca // smlalt z10.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z26.h }, p3/Z, [x14, x0]\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
+ ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x4481432e // smlalb z14.s, p4/M, z25.h, z1.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
+ ".inst 0x44844785 // smlalt z5.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44804334 // smlalb z20.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x448043e8 // smlalb z8.s, p4/M, z31.h, z0.h\n"
".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
- ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4481432f // smlalb z15.s, p4/M, z25.h, z1.h\n"
- ".inst 0x44814732 // smlalt z18.s, p4/M, z25.h, z1.h\n"
- "ld1b { z25.h }, p3/Z, [x13, x2]\n"
- ".inst 0x44814315 // smlalb z21.s, p4/M, z24.h, z1.h\n"
- ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
- ".inst 0x4481470d // smlalt z13.s, p4/M, z24.h, z1.h\n"
- ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
- ".inst 0x448143c5 // smlalb z5.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448147c7 // smlalt z7.s, p4/M, z30.h, z1.h\n"
+ "ldr x20, [x7, #0x118]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x4481472a // smlalt z10.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x13, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x4482430e // smlalb z14.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x448047c5 // smlalt z5.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x44814314 // smlalb z20.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x448143c8 // smlalb z8.s, p4/M, z30.h, z1.h\n"
".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
- ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1]\n"
- ".inst 0x4482430f // smlalb z15.s, p4/M, z24.h, z2.h\n"
- ".inst 0x44824712 // smlalt z18.s, p4/M, z24.h, z2.h\n"
- "ld1b { z24.h }, p3/Z, [x12, x2]\n"
- ".inst 0x44824375 // smlalb z21.s, p4/M, z27.h, z2.h\n"
- ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
- ".inst 0x4482476d // smlalt z13.s, p4/M, z27.h, z2.h\n"
- ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
- ".inst 0x44824345 // smlalb z5.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824747 // smlalt z7.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x4482470a // smlalt z10.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x12, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x4483436e // smlalb z14.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44814745 // smlalt z5.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44824374 // smlalb z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x44824348 // smlalb z8.s, p4/M, z26.h, z2.h\n"
".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
- ".inst 0x4483436f // smlalb z15.s, p4/M, z27.h, z3.h\n"
- ".inst 0x44834772 // smlalt z18.s, p4/M, z27.h, z3.h\n"
- "ld1b { z27.h }, p3/Z, [x10, x2]\n"
- ".inst 0x448342f5 // smlalb z21.s, p4/M, z23.h, z3.h\n"
- ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
- ".inst 0x448346ed // smlalt z13.s, p4/M, z23.h, z3.h\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
- ".inst 0x44834325 // smlalb z5.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834727 // smlalt z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x4483476a // smlalt z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x11, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448442ee // smlalb z14.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448342f4 // smlalb z20.s, p4/M, z23.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #1, MUL VL]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
- ".inst 0x448442ef // smlalb z15.s, p4/M, z23.h, z4.h\n"
- ".inst 0x448446f2 // smlalt z18.s, p4/M, z23.h, z4.h\n"
- "ld1b { z23.h }, p3/Z, [x9, x2]\n"
- ".inst 0x44844395 // smlalb z21.s, p4/M, z28.h, z4.h\n"
- ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
- ".inst 0x4484478d // smlalt z13.s, p4/M, z28.h, z4.h\n"
- "ld1b { z28.h }, p3/Z, [x26, x2]\n"
- ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
- ".inst 0x44844305 // smlalb z5.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844707 // smlalt z7.s, p4/M, z24.h, z4.h\n"
- ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
+ ".inst 0x448446ea // smlalt z10.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448346e7 // smlalt z7.s, p4/M, z23.h, z3.h\n"
+ "ld1b { z23.h }, p3/Z, [x10, x0]\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448043ee // smlalb z14.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44844394 // smlalb z20.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #2, MUL VL]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
- ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
- ".inst 0x448043ef // smlalb z15.s, p4/M, z31.h, z0.h\n"
- ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
- "ld1b { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x448043d5 // smlalb z21.s, p4/M, z30.h, z0.h\n"
- ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
- ".inst 0x448047cd // smlalt z13.s, p4/M, z30.h, z0.h\n"
- ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
- ".inst 0x44804365 // smlalb z5.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44804767 // smlalt z7.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x448047ea // smlalt z10.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x0]\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448143ce // smlalb z14.s, p4/M, z30.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x27, x0]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x448446c5 // smlalt z5.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448043d4 // smlalb z20.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #3, MUL VL]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x44804368 // smlalb z8.s, p4/M, z27.h, z0.h\n"
".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
- ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n"
- "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
- ".inst 0x448143cf // smlalb z15.s, p4/M, z30.h, z1.h\n"
- ".inst 0x448147d2 // smlalt z18.s, p4/M, z30.h, z1.h\n"
- "ld1b { z30.h }, p3/Z, [x27, x2]\n"
- ".inst 0x44814355 // smlalb z21.s, p4/M, z26.h, z1.h\n"
- ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
- ".inst 0x4481474d // smlalt z13.s, p4/M, z26.h, z1.h\n"
- ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
- ".inst 0x448142e5 // smlalb z5.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448147ca // smlalt z10.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448047c7 // smlalt z7.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x4482434e // smlalb z14.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448046e5 // smlalt z5.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x44814354 // smlalb z20.s, p4/M, z26.h, z1.h\n"
+ "ld1sb { z0.h }, p4/Z, [x2, #4, MUL VL]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x448142e8 // smlalb z8.s, p4/M, z23.h, z1.h\n"
".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n"
- "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
- ".inst 0x4482434f // smlalb z15.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
- "ld1b { z26.h }, p3/Z, [x25, x2]\n"
- ".inst 0x44824335 // smlalb z21.s, p4/M, z25.h, z2.h\n"
- ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
- ".inst 0x4482472d // smlalt z13.s, p4/M, z25.h, z2.h\n"
- ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
- ".inst 0x448243e5 // smlalb z5.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x4482474a // smlalt z10.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44814747 // smlalt z7.s, p4/M, z26.h, z1.h\n"
+ "ld1b { z26.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
+ ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x4483432e // smlalb z14.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448147e5 // smlalt z5.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44824334 // smlalb z20.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #5, MUL VL]\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x448243e8 // smlalb z8.s, p4/M, z31.h, z2.h\n"
".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
- ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n"
- "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
- ".inst 0x4483432f // smlalb z15.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834732 // smlalt z18.s, p4/M, z25.h, z3.h\n"
- "ld1b { z25.h }, p3/Z, [x24, x2]\n"
- ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
- ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
- ".inst 0x4483470d // smlalt z13.s, p4/M, z24.h, z3.h\n"
- ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
- ".inst 0x448343c5 // smlalb z5.s, p4/M, z30.h, z3.h\n"
- ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x4483472a // smlalt z10.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44824727 // smlalt z7.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z25.h }, p3/Z, [x25, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x4484430e // smlalb z14.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448247c5 // smlalt z5.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44834314 // smlalb z20.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #6, MUL VL]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x448343c8 // smlalb z8.s, p4/M, z30.h, z3.h\n"
".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
- ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n"
- "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
- "inch x1, ALL, MUL #8\n"
- ".inst 0x4484430f // smlalb z15.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844712 // smlalt z18.s, p4/M, z24.h, z4.h\n"
- "ld1b { z24.h }, p3/Z, [x23, x2]\n"
- ".inst 0x448442d5 // smlalb z21.s, p4/M, z22.h, z4.h\n"
- ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
- ".inst 0x448446cd // smlalt z13.s, p4/M, z22.h, z4.h\n"
- ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
- ".inst 0x44844385 // smlalb z5.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4484470a // smlalt z10.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44834707 // smlalt z7.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z24.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x4480436e // smlalb z14.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44834785 // smlalt z5.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x448442d4 // smlalb z20.s, p4/M, z22.h, z4.h\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #7, MUL VL]\n"
+ "inch x2, ALL, MUL #8\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
- ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
- "ld1sb { z4.h }, p4/Z, [x1]\n"
- "inch x1\n"
- ".inst 0x4480436f // smlalb z15.s, p4/M, z27.h, z0.h\n"
- ".inst 0x44804772 // smlalt z18.s, p4/M, z27.h, z0.h\n"
- "ld1b { z27.h }, p3/Z, [x22, x2]\n"
- ".inst 0x448042f5 // smlalb z21.s, p4/M, z23.h, z0.h\n"
- ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
- ".inst 0x448046ed // smlalt z13.s, p4/M, z23.h, z0.h\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
- ".inst 0x44804325 // smlalb z5.s, p4/M, z25.h, z0.h\n"
- ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x4480476a // smlalt z10.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x448142ee // smlalb z14.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448446c7 // smlalt z7.s, p4/M, z22.h, z4.h\n"
+ "ld1w { z18.s }, p1/Z, [x16, #1, MUL VL]\n"
+ "addvl x16, x16, #2\n"
+ ".inst 0x44844745 // smlalt z5.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x448042f4 // smlalb z20.s, p4/M, z23.h, z0.h\n"
+ "ld1sb { z4.h }, p4/Z, [x2]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x44804328 // smlalb z8.s, p4/M, z25.h, z0.h\n"
".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
- ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n"
- ".inst 0x448142ef // smlalb z15.s, p4/M, z23.h, z1.h\n"
- ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
- ".inst 0x448146f2 // smlalt z18.s, p4/M, z23.h, z1.h\n"
- ".inst 0x448143f5 // smlalb z21.s, p4/M, z31.h, z1.h\n"
- ".inst 0x448147ed // smlalt z13.s, p4/M, z31.h, z1.h\n"
- ".inst 0x44814305 // smlalb z5.s, p4/M, z24.h, z1.h\n"
- ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
- "ld1b { z24.h }, p3/Z, [x20, x2]\n"
- ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
- ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n"
- ".inst 0x448243ef // smlalb z15.s, p4/M, z31.h, z2.h\n"
- ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
- ".inst 0x448247f2 // smlalt z18.s, p4/M, z31.h, z2.h\n"
- ".inst 0x448243d5 // smlalb z21.s, p4/M, z30.h, z2.h\n"
- ".inst 0x448247cd // smlalt z13.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824365 // smlalb z5.s, p4/M, z27.h, z2.h\n"
- ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
- "ld1b { z27.h }, p3/Z, [x19, x2]\n"
"inch x2\n"
+ ".inst 0x448146ea // smlalt z10.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44804730 // smlalt z16.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448046e7 // smlalt z7.s, p4/M, z23.h, z0.h\n"
+ "uzp1 z23.s, z19.s, z18.s\n"
+ ".inst 0x44804705 // smlalt z5.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x448143f4 // smlalb z20.s, p4/M, z31.h, z1.h\n"
+ "uzp2 z22.s, z19.s, z18.s\n"
+ ".inst 0x44814308 // smlalb z8.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44814710 // smlalt z16.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x21, x0]\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x448343ce // smlalb z14.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448147e7 // smlalt z7.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44814765 // smlalt z5.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448243d4 // smlalb z20.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824368 // smlalb z8.s, p4/M, z27.h, z2.h\n"
".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
- "whilelt p2.s, x2, x0\n"
- ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
- "mov x19, x2\n"
- ".inst 0x448343cf // smlalb z15.s, p4/M, z30.h, z3.h\n"
- "incw x19\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
- "whilelt p1.s, x19, x0\n"
- ".inst 0x448347d2 // smlalt z18.s, p4/M, z30.h, z3.h\n"
- "whilelt p3.h, x2, x0\n"
- ".inst 0x44834395 // smlalb z21.s, p4/M, z28.h, z3.h\n"
- ".inst 0x4483478d // smlalt z13.s, p4/M, z28.h, z3.h\n"
- ".inst 0x44834325 // smlalb z5.s, p4/M, z25.h, z3.h\n"
- ".inst 0x44834727 // smlalt z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x448347ca // smlalt z10.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x4484438e // smlalb z14.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x448247c7 // smlalt z7.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04a975ce // sqrdmulh z14.s, z14.s, z9.s\n"
+ "inch x0\n"
+ ".inst 0x44824725 // smlalt z5.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44834394 // smlalb z20.s, p4/M, z28.h, z3.h\n"
+ "and z21.d, z14.d, z23.d\n"
+ "mov x20, x0\n"
+ ".inst 0x44834328 // smlalb z8.s, p4/M, z25.h, z3.h\n"
".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
- ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
- ".inst 0x4484438f // smlalb z15.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844792 // smlalt z18.s, p4/M, z28.h, z4.h\n"
- ".inst 0x44844355 // smlalb z21.s, p4/M, z26.h, z4.h\n"
- ".inst 0x4484474d // smlalt z13.s, p4/M, z26.h, z4.h\n"
- ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
- ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
- ".inst 0x04aa76b5 // sqrdmulh z21.s, z21.s, z10.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- "and z28.d, z15.d, z29.d\n"
- "and z26.d, z18.d, z19.d\n"
- "and z16.d, z21.d, z29.d\n"
- "asr z28.s, z28.s, #0x1f\n"
- "asr z26.s, z26.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z15.s, z15.s, z28.s\n"
- "sqadd z18.s, z18.s, z26.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "and z16.d, z13.d, z19.d\n"
- ".inst 0x44844305 // smlalb z5.s, p4/M, z24.h, z4.h\n"
- ".inst 0x44844707 // smlalt z7.s, p4/M, z24.h, z4.h\n"
- "asr z16.s, z16.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "incw x20\n"
+ ".inst 0x4484478a // smlalt z10.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44834787 // smlalt z7.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x04bd754a // sqrdmulh z10.s, z10.s, z29.s\n"
+ "whilelt p2.s, x0, x1\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834705 // smlalt z5.s, p4/M, z24.h, z3.h\n"
+ "and z3.d, z10.d, z22.d\n"
+ "whilelt p1.s, x20, x1\n"
+ ".inst 0x44844354 // smlalb z20.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844308 // smlalb z8.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x04a97694 // sqrdmulh z20.s, z20.s, z9.s\n"
+ "whilelt p3.h, x0, x1\n"
".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
- ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
- "sqadd z13.s, z13.s, z16.s\n"
- ".inst 0x04b474e7 // sqrdmulh z7.s, z7.s, z20.s\n"
- ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n"
- "and z16.d, z5.d, z29.d\n"
- ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n"
- "and z25.d, z7.d, z19.d\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z26.d, z6.d, z29.d\n"
- "asr z25.s, z25.s, #0x1f\n"
- "sqadd z5.s, z5.s, z16.s\n"
- "asr z26.s, z26.s, #0x1f\n"
- "sqadd z7.s, z7.s, z25.s\n"
- ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
- "sqadd z6.s, z6.s, z26.s\n"
- ".inst 0x448293af // srshl z15.s, p4/M, z15.s, z29.s\n"
- ".inst 0x44829272 // srshl z18.s, p4/M, z18.s, z19.s\n"
- "and z16.d, z8.d, z19.d\n"
- ".inst 0x448293b5 // srshl z21.s, p4/M, z21.s, z29.s\n"
- "add z15.s, z15.s, z17.s\n"
- "add z18.s, z18.s, z17.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- "add z21.s, z21.s, z17.s\n"
- "smin z15.s, p4/M, z15.s, z11.s\n"
- "sqadd z8.s, z8.s, z16.s\n"
- "smin z18.s, p4/M, z18.s, z11.s\n"
- "smin z21.s, p4/M, z21.s, z11.s\n"
- "smax z15.s, p4/M, z15.s, z12.s\n"
- ".inst 0x4482926d // srshl z13.s, p4/M, z13.s, z19.s\n"
- "smax z18.s, p4/M, z18.s, z12.s\n"
- "smax z21.s, p4/M, z21.s, z12.s\n"
- ".inst 0x448293a5 // srshl z5.s, p4/M, z5.s, z29.s\n"
- "add z13.s, z13.s, z17.s\n"
- "trn1 z15.h, z15.h, z18.h\n"
- "st1b { z15.h }, p0, [x7, x3]\n"
- "add z5.s, z5.s, z17.s\n"
- "smin z13.s, p4/M, z13.s, z11.s\n"
- ".inst 0x44829267 // srshl z7.s, p4/M, z7.s, z19.s\n"
- ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
- "smin z5.s, p4/M, z5.s, z11.s\n"
- "smax z13.s, p4/M, z13.s, z12.s\n"
- "add z7.s, z7.s, z17.s\n"
- "add z6.s, z6.s, z17.s\n"
- "smax z5.s, p4/M, z5.s, z12.s\n"
- "trn1 z21.h, z21.h, z13.h\n"
- "st1b { z21.h }, p0, [x8, x3]\n"
- "smin z7.s, p4/M, z7.s, z11.s\n"
- "smin z6.s, p4/M, z6.s, z11.s\n"
- ".inst 0x44829268 // srshl z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z12.s\n"
- "smax z6.s, p4/M, z6.s, z12.s\n"
- "add z8.s, z8.s, z17.s\n"
- "trn1 z5.h, z5.h, z7.h\n"
- "st1b { z5.h }, p0, [x17, x3]\n"
- "smin z8.s, p4/M, z8.s, z11.s\n"
- "smax z8.s, p4/M, z8.s, z12.s\n"
- "trn1 z6.h, z6.h, z8.h\n"
- "st1b { z6.h }, p0, [x16, x3]\n"
- "inch x3\n"
- "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1w { z4.s }, p2/Z, [x19]\n"
- "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
- "uzp1 z15.s, z4.s, z16.s\n"
- "addvl x19, x19, #2\n"
- "str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "uzp2 z18.s, z4.s, z16.s\n"
- "mov z21.d, z15.d\n"
- "ld1sb { z0.h }, p4/Z, [x1]\n"
- "mov z5.d, z15.d\n"
- "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
- "mov z13.d, z18.d\n"
- "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
- "mov z7.d, z18.d\n"
- "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
- "mov z6.d, z15.d\n"
- "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
- "mov z8.d, z18.d\n"
- "ldp x28, x27, [x5, #0x0]\n"
- ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
- "ldp x26, x25, [x5, #0x10]\n"
- ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
- ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
- "ldp x24, x23, [x5, #0x20]\n"
- ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
- ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
- "ldp x22, x21, [x5, #0x30]\n"
- "ldp x20, x19, [x5, #0x40]\n"
- "ld1b { z31.h }, p3/Z, [x28, x2]\n"
- ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
- "ld1b { z30.h }, p3/Z, [x27, x2]\n"
- "ld1b { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
- "ld1b { z28.h }, p3/Z, [x25, x2]\n"
- ".inst 0x45491bbd // usublb z29.h, z29.b, z9.b\n"
- "ld1b { z27.h }, p3/Z, [x24, x2]\n"
- "ld1b { z23.h }, p3/Z, [x23, x2]\n"
- ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
- "ld1b { z25.h }, p3/Z, [x22, x2]\n"
- "ld1b { z24.h }, p3/Z, [x21, x2]\n"
- ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
- ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
- "ld1b { z26.h }, p3/Z, [x20, x2]\n"
- "ld1b { z22.h }, p3/Z, [x19, x2]\n"
- ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
- ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
- ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
- ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n"
+ ".inst 0x44844747 // smlalt z7.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x04a97508 // sqrdmulh z8.s, z8.s, z9.s\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844765 // smlalt z5.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x04a974c6 // sqrdmulh z6.s, z6.s, z9.s\n"
+ "sqadd z14.s, z14.s, z21.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x448292ee // srshl z14.s, p4/M, z14.s, z23.s\n"
+ "and z19.d, z20.d, z23.d\n"
+ ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
+ "and z18.d, z8.d, z23.d\n"
+ ".inst 0x04bd7610 // sqrdmulh z16.s, z16.s, z29.s\n"
+ "and z21.d, z6.d, z23.d\n"
+ ".inst 0x04bd74a5 // sqrdmulh z5.s, z5.s, z29.s\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448292ca // srshl z10.s, p4/M, z10.s, z22.s\n"
+ "and z1.d, z7.d, z22.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z2.d, z16.d, z22.d\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z3.d, z5.d, z22.d\n"
+ "sqadd z20.s, z20.s, z19.s\n"
+ ".inst 0x448292f4 // srshl z20.s, p4/M, z20.s, z23.s\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z18.s\n"
+ ".inst 0x448292e8 // srshl z8.s, p4/M, z8.s, z23.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z21.s\n"
+ ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "sqadd z7.s, z7.s, z1.s\n"
+ ".inst 0x448292c7 // srshl z7.s, p4/M, z7.s, z22.s\n"
+ "sqadd z16.s, z16.s, z2.s\n"
+ "sqadd z5.s, z5.s, z3.s\n"
+ ".inst 0x448292d0 // srshl z16.s, p4/M, z16.s, z22.s\n"
+ ".inst 0x448292c5 // srshl z5.s, p4/M, z5.s, z22.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x45304294 // sqxtnb z20.h, z20.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x4530454e // sqxtnt z14.h, z10.s\n"
+ ".inst 0x453044f4 // sqxtnt z20.h, z7.s\n"
+ ".inst 0x45304608 // sqxtnt z8.h, z16.s\n"
+ ".inst 0x453044a6 // sqxtnt z6.h, z5.s\n"
+ "sqadd z14.h, z14.h, z12.h\n"
+ "sqadd z20.h, z20.h, z12.h\n"
+ "smax z14.h, p4/M, z14.h, z13.h\n"
+ "smax z20.h, p4/M, z20.h, z13.h\n"
+ "sqadd z8.h, z8.h, z12.h\n"
+ "sqadd z6.h, z6.h, z12.h\n"
+ "smax z8.h, p4/M, z8.h, z13.h\n"
+ "smax z6.h, p4/M, z6.h, z13.h\n"
+ "smin z14.h, p4/M, z14.h, z11.h\n"
+ "smin z20.h, p4/M, z20.h, z11.h\n"
+ "st1b { z14.h }, p0, [x3, x8]\n"
+ "smin z8.h, p4/M, z8.h, z11.h\n"
+ "smin z6.h, p4/M, z6.h, z11.h\n"
+ "st1b { z20.h }, p0, [x4, x8]\n"
+ "st1b { z8.h }, p0, [x5, x8]\n"
+ "st1b { z6.h }, p0, [x6, x8]\n"
+ "ld1w { z30.s }, p2/Z, [x14]\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z14.s, z30.s, z16.s\n"
+ "ld1sb { z0.h }, p4/Z, [x2]\n"
+ "ld1sb { z1.h }, p4/Z, [x2, #1, MUL VL]\n"
+ "uzp2 z10.s, z30.s, z16.s\n"
+ "addvl x14, x14, #2\n"
+ "ld1sb { z2.h }, p4/Z, [x2, #2, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x2, #3, MUL VL]\n"
+ "inch x8\n"
+ "str x14, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z4.h }, p4/Z, [x2, #4, MUL VL]\n"
+ "ldp x9, x28, [x7, #0x0]\n"
+ "mov z20.d, z14.d\n"
+ "mov z7.d, z10.d\n"
+ "ldp x27, x26, [x7, #0x10]\n"
+ "ldp x25, x24, [x7, #0x20]\n"
+ "mov z8.d, z14.d\n"
+ "mov z16.d, z10.d\n"
+ "ldp x23, x22, [x7, #0x30]\n"
+ "ldp x21, x20, [x7, #0x40]\n"
+ "mov z6.d, z14.d\n"
+ "mov z5.d, z10.d\n"
+ "ld1b { z31.h }, p3/Z, [x9, x0]\n"
+ "ld1b { z30.h }, p3/Z, [x28, x0]\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ "ld1b { z29.h }, p3/Z, [x27, x0]\n"
+ "ld1b { z28.h }, p3/Z, [x26, x0]\n"
+ ".inst 0x45511042 // ssublb z2.h, z2.b, z17.b\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ "ld1b { z27.h }, p3/Z, [x25, x0]\n"
+ "ld1b { z23.h }, p3/Z, [x24, x0]\n"
+ ".inst 0x45511084 // ssublb z4.h, z4.b, z17.b\n"
+ ".inst 0x454f1bff // usublb z31.h, z31.b, z15.b\n"
+ "ld1b { z25.h }, p3/Z, [x23, x0]\n"
+ "ld1b { z24.h }, p3/Z, [x22, x0]\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ ".inst 0x454f1bbd // usublb z29.h, z29.b, z15.b\n"
+ "ld1b { z26.h }, p3/Z, [x21, x0]\n"
+ "ld1b { z22.h }, p3/Z, [x20, x0]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1b7b // usublb z27.h, z27.b, z15.b\n"
+ ".inst 0x454f1af7 // usublb z23.h, z23.b, z15.b\n"
+ ".inst 0x454f1b39 // usublb z25.h, z25.b, z15.b\n"
+ ".inst 0x454f1b18 // usublb z24.h, z24.b, z15.b\n"
+ ".inst 0x454f1b5a // usublb z26.h, z26.b, z15.b\n"
+ ".inst 0x454f1ad6 // usublb z22.h, z22.b, z15.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}