Diffstat (limited to 'src/core/NEON/kernels/arm_conv/depthwise/kernels')
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  528
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  515
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  829
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  907
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  1233
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1399
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  616
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  631
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  973
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  1022
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp  59
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp  527
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  1049
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  68
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  524
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  511
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  68
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  825
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  903
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  68
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  1229
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  1395
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  68
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  612
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  627
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  68
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  969
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  1018
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp  55
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp  379
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp  532
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp  916
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  58
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  851
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  1318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1192
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1423
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  2213
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp  55
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp  624
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  527
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  662
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  58
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  1484
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  1184
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  1318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1192
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1423
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  2213
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp  55
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp  624
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  527
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  662
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  58
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  1484
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  1192
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  1423
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  2213
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp  55
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp  624
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  58
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  1484
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  324
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  284
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  478
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  495
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  688
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  746
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  345
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  345
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  531
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  559
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  255
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp  364
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp  318
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp  66
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp  247
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp  538
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp  547
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp  688
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp  820
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp  405
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp  397
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  72
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp  531
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp  633
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp  59
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp  166
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp  259
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp  392
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp  62
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp  454
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  457
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  418
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  459
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  660
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  353
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  428
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  388
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp  457
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  418
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  459
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  660
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp  353
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp  428
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp  418
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp  459
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp  660
159 files changed, 72606 insertions, 0 deletions
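
Each kernel name in the listing encodes its configuration: target ISA (a64 for fixed-width NEON, sve for the Scalable Vector Extension), data type (fp16, fp32, the quantized s8q/s8qs/u8q variants, or mixed-sign u8s8u8q), tensor layout (nhwc, or packed_to_nhwc ... with_multiplier for packed weights with a channel multiplier), kernel size and stride (3x3/5x5, s1/s2), the output tile produced per iteration (output2x2 up to output4x4, or output9/output2x8 for the generic kernels), the accumulating operation (mla for multiply-accumulate, dot for dot-product instructions), and the depth-first traversal strategy. For example, a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst is the NEON half-precision NHWC kernel that computes a 2x2 output tile of a 3x3, stride-1 depthwise convolution with fused multiply-accumulates.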
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bb43d57018
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
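
As a usage sketch (a hedged illustration, not part of this patch): the struct above is a plain strategy descriptor, compile-time tile geometry plus two function pointers. Assuming the header is on the include path, a caller could drive the direct kernel as below; the function name, the buffer arguments, and the +/-65504 activation bounds (the finite __fp16 range, i.e. effectively no clamping) are placeholders of this sketch.

    #include "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"

    #include <cstdint>

    using Strategy = arm_conv::depthwise::a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst;

    // A 2x2 output tile at stride 1 consumes a 4x4 input patch:
    // input_rows == kernel_rows + (output_rows - 1) * stride_rows.
    static_assert(Strategy::input_rows ==
                  Strategy::kernel_rows + (Strategy::output_rows - 1) * Strategy::stride_rows,
                  "tile geometry is self-consistent");

    void run_direct(const __fp16 *input, __fp16 *output, const void *packed_params,
                    unsigned int n_tile_rows, unsigned int n_tile_cols,
                    unsigned int n_channels,
                    int64_t ld_in_row, int64_t ld_in_col,
                    int64_t ld_out_row, int64_t ld_out_col)
    {
      Strategy strat(nullptr);  // the constructor ignores its CPUInfo argument
      strat.direct_kernel(n_tile_rows, n_tile_cols,
                          input, ld_in_row, ld_in_col,
                          output, ld_out_row, ld_out_col,
                          packed_params, n_channels,
                          static_cast<__fp16>(-65504.0f),  // activation_min (placeholder)
                          static_cast<__fp16>(65504.0f));  // activation_max (placeholder)
    }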
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..99f46015aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x15, #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
+ "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x12, x12, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x9, x12, x23, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x28, x9, x23, LSL #1\n"
+ "lsl x13, x13, #0x1\n"
+ "add x27, x28, x23, LSL #1\n"
+ "add x26, x13, x13\n"
+ "add x25, x26, x13\n"
+ "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x15\n" // offset *= output_tile_size
+ "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x10, x20, LSL #1\n"
+ "lsl x11, x11, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x9, x13]\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "ldr q11, [x12, x25]\n"
+ "ldr q12, [x9, x26]\n"
+ "ldr q13, [x28, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x27]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x26]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x27, x25]\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr q16, [x14, #0x0]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x12, x13]\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x12, x26]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x9]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x28, x25]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr q13, [x28, x13]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x27, x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x12, x25]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x9, x26]\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "add x14, x14, #0xa0\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q30, [x10, x11]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "st1 { v29.8h }, [x24]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "add x10, x10, #0x10\n"
+ "str q28, [x24, x11]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x27]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x26]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x27, x25]\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x12, x13]\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x12, x26]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x9]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x28]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x28, x25]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x27, x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x10, x11]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "st1 { v29.8h }, [x24]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x24, x11]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 31f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x23, x9, x13\n"
+ "ldr q1, [x14, #0x20]\n"
+ "add x22, x12, XZR\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x21, x12, x25\n"
+ "ldr q3, [x14, #0x40]\n"
+ "add x20, x9, x26\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x19, x28, x13\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "ldr h13, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "add x19, x27, XZR\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "add x19, x27, x25\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "add x19, x12, x13\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x19, x12, x26\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "add x19, x28, x26\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "add x19, x9, XZR\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "add x19, x9, x25\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "add x19, x28, XZR\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "add x19, x28, x25\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "add x19, x27, x13\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "add x19, x27, x26\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x19, x10\n"
+ "st1 { v31.s }[0], [x19], x11\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x24\n"
+ "st1 { v29.s }[0], [x19], x11\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x20, x10\n"
+ "st1 { v31.h }[2], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x11\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x10\n"
+ "st1 { v31.h }[0], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x11\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "31:" // Tile loop: End
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x17, #0x1\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x16, x16, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x16, x19\n"
+ "csel x16, x16, XZR, LT\n"
+ "csel x17, x17, x21, LT\n"
+ "cmp x17, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
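
The tile addressing performed at the top of the assembly is documented by its inline comments; restated in plain C++ for readability (a sketch with local names; it adds nothing beyond what the comments already state):

    #include <cstdint>

    // Mirrors the commented sequence in the assembly above:
    //   offset  = tile_i * ld_input_row
    //   offset += tile_j * ld_input_col
    //   offset *= kernel_stride * output_size  (== 2 for this stride-1, 2x2-output kernel)
    //   inptr  += offset * sizeof(__fp16)      (the "LSL #1" in the asm)
    const __fp16 *tile_input_ptr(const __fp16 *inptr,
                                 uint64_t tile_i, uint64_t tile_j,
                                 uint64_t ld_input_row, uint64_t ld_input_col)
    {
      uint64_t offset = tile_i * ld_input_row + tile_j * ld_input_col;
      offset *= 2;            // advance two input rows/columns per output tile
      return inptr + offset;  // element arithmetic; the asm scales to bytes itself
    }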
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..af83238d2e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldr x22, [x16, #0x20]\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr q12, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x19, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x22, [x16, #0x20]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr q13, [x22, x11]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x26, x11]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x11]\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x24, x11]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x23, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "cmp x11, x27, LSL #4\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "add x15, x15, #0xa0\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x9, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x19, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 30f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x26, [x16, #0x0]\n"
+ "ldr x25, [x16, #0x8]\n"
+ "add x26, x26, x14\n"
+ "ldr x24, [x16, #0x10]\n"
+ "ldr x23, [x16, #0x18]\n"
+ "add x25, x25, x14\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v13.h }[0], [x22], #0x2\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x21, x21, x14\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "7:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "9:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "13:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "19:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "21:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "29:" // Oddments: Store: Bit 1: End
+
+ "30:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
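+
+// A minimal exposition-only sketch (not called by the kernel) of the
+// "Oddments" pattern above: bit 1 of n_channels selects a 4-byte
+// (two-halfword) transfer and bit 0 a trailing 2-byte one, so the last
+// one to three channels are handled without a loop.
+static inline void copy_fp16_tail(__fp16 *dst, const __fp16 *src, unsigned int n_channels)
+{
+  unsigned int i = 0;
+  if (n_channels & 2) { dst[0] = src[0]; dst[1] = src[1]; i = 2; }  // cf. "ld1 { v.s }[0]"
+  if (n_channels & 1) { dst[i] = src[i]; }                          // cf. "ld1 { v.h }[i]"
+}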
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..90db8703b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
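+
+// Exposition-only check of the tile geometry above (this static_assert is an
+// editorial addition, not generated code): an output tile of R rows at
+// stride S under a K-row kernel needs K + (R - 1) * S input rows, hence
+// 3 + (3 - 1) * 1 = 5 here, and likewise for columns.
+static_assert(a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst::input_rows ==
+              a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst::kernel_rows +
+              (a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst::output_rows - 1) *
+              a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst::stride_rows,
+              "input tile rows follow from kernel size, output tile size and stride");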
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3bdd544a54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
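+    // Scratch written back by the assembly: the tile loop stores the current
+    // indices here at label "1:" and reloads them at "49:" to advance to the
+    // next tile.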
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
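+
+  // In the assembly below, v18 and v17 hold activation_min and activation_max
+  // broadcast across all lanes (the ld1r pair); every accumulator is clamped
+  // with fmax/fmin against them before being stored.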
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x23, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x15, x15, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x12, x15, x22, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x11, x12, x22, LSL #1\n"
+ "lsl x16, x16, #0x1\n"
+ "add x10, x11, x22, LSL #1\n"
+ "add x9, x10, x22, LSL #1\n"
+ "add x28, x16, x16\n"
+ "add x27, x28, x16\n"
+ "add x26, x27, x16\n"
+ "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x13, x13, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "lsl x14, x14, #0x1\n"
+ "add x22, x14, x14\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldr q9, [x11, x28]\n"
+ "ld1 { v10.8h }, [x15]\n"
+ "ldr q11, [x15, x26]\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "ldr q13, [x12, x28]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "cmp x21, x19, LSL #4\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x16]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x26]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x15, x16]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x12, x26]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x10]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x10, x28]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x26]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x9, x16]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x16]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x9, x27]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x10, x16]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "ld1 { v10.8h }, [x15]\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x10, x27]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x11, x26]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "ldr q9, [x11, x28]\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x9, x28]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x26]\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x12, x28]\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "add x17, x17, #0xa0\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x13]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x13, x14]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x13, x22]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "st1 { v28.8h }, [x25]\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "str q27, [x25, x14]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "str q26, [x25, x22]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "st1 { v25.8h }, [x24]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q24, [x24, x14]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q23, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x16]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x26]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x15, x16]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x12, x26]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x10]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x10, x28]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x26]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x9, x16]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x16]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x9, x27]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x10, x16]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x10, x27]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x11, x26]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x9, x28]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x13]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x13, x14]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x13, x22]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "st1 { v28.8h }, [x25]\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "str q27, [x25, x14]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "str q26, [x25, x22]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "st1 { v25.8h }, [x24]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q24, [x24, x14]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q23, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 49f\n"
+ "ldr q16, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "add x23, x11, x28\n"
+ "ldr q1, [x17, #0x20]\n"
+ "add x22, x15, XZR\n"
+ "ldr q2, [x17, #0x30]\n"
+ "add x21, x15, x26\n"
+ "ldr q3, [x17, #0x40]\n"
+ "add x20, x9, XZR\n"
+ "ldr q4, [x17, #0x50]\n"
+ "add x19, x12, x28\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "ldr h13, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x19, x9, x26\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "add x19, x11, x16\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x19, x15, x16\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "add x19, x15, x27\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x19, x11, x27\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "add x19, x12, XZR\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "add x19, x12, x26\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "add x19, x10, XZR\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "add x19, x10, x28\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "add x19, x10, x26\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "add x19, x9, x16\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "add x19, x12, x16\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "add x19, x12, x27\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "add x19, x9, x27\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "add x19, x10, x16\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "add x19, x15, x28\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x19, x10, x27\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "add x19, x11, XZR\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "add x19, x11, x26\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "add x19, x9, x28\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "mov x19, x13\n"
+ "st1 { v31.s }[0], [x19], x14\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v30.s }[0], [x19], x14\n"
+ "mov x20, x25\n"
+ "st1 { v29.s }[0], [x19]\n"
+ "st1 { v28.s }[0], [x20], x14\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v27.s }[0], [x20], x14\n"
+ "mov x19, x24\n"
+ "st1 { v26.s }[0], [x20]\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v25.s }[0], [x19], x14\n"
+ "st1 { v24.s }[0], [x19], x14\n"
+ "st1 { v23.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "mov x21, x13\n"
+ "st1 { v31.h }[2], [x21], x14\n"
+ "mov x20, x25\n"
+ "st1 { v30.h }[2], [x21], x14\n"
+ "st1 { v28.h }[2], [x20], x14\n"
+ "mov x19, x24\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v27.h }[2], [x20], x14\n"
+ "st1 { v26.h }[2], [x20]\n"
+ "st1 { v25.h }[2], [x19], x14\n"
+ "st1 { v24.h }[2], [x19], x14\n"
+ "st1 { v23.h }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x13\n"
+ "st1 { v31.h }[0], [x21], x14\n"
+ "mov x20, x25\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[0], [x21], x14\n"
+ "st1 { v28.h }[0], [x20], x14\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v27.h }[0], [x20], x14\n"
+ "st1 { v26.h }[0], [x20]\n"
+ "st1 { v25.h }[0], [x19], x14\n"
+ "st1 { v24.h }[0], [x19], x14\n"
+ "st1 { v23.h }[0], [x19]\n"
+ "48:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "49:" // Tile loop: End
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x7, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x8, x8, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x8, x19\n"
+ "csel x8, x8, XZR, LT\n"
+ "csel x7, x7, x21, LT\n"
+ "cmp x7, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
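+
+// Exposition-only sketch (not called by the kernel) of the tile-advance
+// logic at label "49:" above: the csel pair increments tile_j with
+// wrap-around, carrying into tile_i, and "blt 1b" re-enters the tile loop
+// until all n_tile_rows x n_tile_cols tiles have been visited.
+static inline bool next_tile(uint64_t &tile_i, uint64_t &tile_j,
+                             uint64_t n_tile_rows, uint64_t n_tile_cols)
+{
+  if (++tile_j >= n_tile_cols) { tile_j = 0; ++tile_i; }  // cf. the two csel's
+  return tile_i < n_tile_rows;                            // cf. "cmp x7, x20; blt 1b"
+}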
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..ed47c308c4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
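+      // input_ptrs holds the 5x5 input patch in row-major order (index =
+      // row * 5 + col); the assembly consumes the points in a different
+      // order, so remap here. For example inptrs[0] is the centre (2, 2)
+      // and inptrs[1..4] are (0, 0), (0, 4), (4, 0) and (1, 2), matching
+      // the "Oddments: Load inputs" labels below.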
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "mov x13, #0x10\n" // cntb _, ALL, #1
+ "sub x12, XZR, x13\n"
+ "lsr x11, %x[n_channels], #0x3\n"
+ "cbz x11, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x13, x11, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "ldr x26, [x16, #0x20]\n"
+ "ldr q9, [x10, x14]\n"
+ "ldr q10, [x9, x14]\n"
+ "ldr q11, [x28, x14]\n"
+ "ldr q12, [x27, x14]\n"
+ "ldr q13, [x26, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x27, x14]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x28, x14]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr q9, [x10, x13]\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "ldr q10, [x9, x13]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x26, x13]\n"
+ "add x13, x13, #0x10\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "cmp x13, x11, LSL #4\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "str q25, [x20, x12]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q24, [x19, x12]\n"
+ "str q23, [x22, x12]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x27, x14]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x28, x14]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "str q25, [x20, x12]\n"
+ "str q24, [x19, x12]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q23, [x22, x12]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 48f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x10, [x16, #0x0]\n"
+ "add x10, x10, x14\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "add x9, x9, x14\n"
+ "ldr x27, [x16, #0x18]\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v13.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x10], #0x2\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v13.h }[2], [x26], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x10], #0x2\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "ld1 { v13.h }[0], [x26], #0x2\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x25, x25, x14\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v12.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v12.h }[2], [x25], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x25], #0x2\n"
+ "7:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "9:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.h }[2], [x10], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x10], #0x2\n"
+ "13:" // Oddments: Load input (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "15:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x27], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v13.h }[0], [x27], #0x2\n"
+ "19:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.h }[2], [x26], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v12.h }[0], [x26], #0x2\n"
+ "21:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "23:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "27:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.h }[2], [x10], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x10], #0x2\n"
+ "29:" // Oddments: Load input (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "add x9, x9, x14\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x9], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x9], #0x2\n"
+ "31:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v13.h }[0], [x28], #0x2\n"
+ "33:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "35:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "37:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x25, x25, x14\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[2], [x25], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.h }[0], [x25], #0x2\n"
+ "39:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "41:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "43:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v13.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v13.h }[2], [x10], #0x2\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v13.h }[0], [x10], #0x2\n"
+ "45:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[0], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[0], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[2], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[2], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[2], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[2], [x19]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "b 47f\n"
+ "46:" // Oddments: Store: Bit 1: Unset
+ "ldr x22, [x17, #0x0]\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x10]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[0], [x19]\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "47:" // Oddments: Store: Bit 1: End
+
+ "48:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
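
[Editor's note] The long "Oddments" blocks above all follow one pattern: after the 8-lane main loop, the two low bits of n_channels drive tbz branches that load first a 32-bit pair of halves (ld1 { v.s }[0]) and then a single half (ld1 { v.h }[2]), so the kernel never reads past the channel tail; the stores at labels 46/47 mirror the same decomposition. A minimal C++ sketch of that idea follows — the function name and the uint16_t stand-in for __fp16 are illustrative, not library code:

    #include <cstdint>

    // Scalar model of the tbz-driven tail loads in the assembly above.
    // "lanes" models the low lanes of a v-register; uint16_t stands in
    // for __fp16 storage.
    static void load_tail_fp16(const uint16_t *src, uint16_t lanes[8],
                               unsigned int n_channels)
    {
      unsigned int pos = 0;
      if (n_channels & 2)     // tbz %x[n_channels], #1 -> "ld1 { v.s }[0]"
      {
        lanes[pos + 0] = src[pos + 0];  // one 32-bit load brings in
        lanes[pos + 1] = src[pos + 1];  // two consecutive halves
        pos += 2;
      }
      if (n_channels & 1)     // tbz %x[n_channels], #0 -> "ld1 { v.h }[pos]"
      {
        lanes[pos] = src[pos];          // final leftover half
      }
    }

The store side (labels 46/47 above) applies the same bit decomposition in reverse, writing a .s lane then a .h lane through the per-row output pointers.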
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..df5328724d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
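
[Editor's note] The geometry constants in this header are linked: a K-row kernel at stride S producing an R-row output tile reads (R - 1) * S + K input rows, which is where input_rows/input_cols = 6 come from for the 4x4, stride-1, 3x3 case. A compile-time check of that relationship (illustrative only, not part of the header):

    // An R-point output at stride S under a K-point kernel needs
    // (R - 1) * S + K input points along that dimension.
    constexpr unsigned int required_input_dim(unsigned int output_dim,
                                              unsigned int stride,
                                              unsigned int kernel_dim)
    {
      return (output_dim - 1) * stride + kernel_dim;
    }

    static_assert(required_input_dim(4, 1, 3) == 6,
                  "6x6 input patch for a 4x4 / s1 / 3x3 tile");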
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..bf18469199
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1233 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x4, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x24, #0x4\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x23, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x7, #0x0\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x4, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v15.8h }, [x23]\n"
+ "add x15, x8, x22, LSL #1\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "add x14, x15, x22, LSL #1\n"
+ "lsl x6, x6, #0x1\n"
+ "add x13, x14, x22, LSL #1\n"
+ "add x12, x13, x22, LSL #1\n"
+ "add x11, x12, x22, LSL #1\n"
+ "add x10, x6, x6\n"
+ "add x9, x10, x6\n"
+ "add x28, x9, x6\n"
+ "add x27, x28, x6\n"
+ "mul x19, x4, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x24\n" // offset *= output_tile_size
+ "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x26, x16, x20, LSL #1\n"
+ "add x25, x26, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "lsl x17, x17, #0x1\n"
+ "add x23, x17, x17\n"
+ "add x22, x23, x17\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q13, [x5, #0x0]\n"
+ "ldr q0, [x5, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x5, #0x20]\n"
+ "ldr q2, [x5, #0x30]\n"
+ "ldr q3, [x5, #0x40]\n"
+ "ldr q4, [x5, #0x50]\n"
+ "ldr q5, [x5, #0x60]\n"
+ "ldr q6, [x5, #0x70]\n"
+ "ldr q7, [x5, #0x80]\n"
+ "ldr q8, [x5, #0x90]\n"
+ "add x5, x5, #0xa0\n"
+ "ldr q9, [x14, x10]\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "ldr q11, [x8, x27]\n"
+ "ldr q12, [x14, x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "cmp x21, x19, LSL #4\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x11]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x27]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x8, x6]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x13, x9]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x8, x28]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr q13, [x5, #0x0]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x15, x9]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x14, x6]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x11, x6]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x13, x6]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x8, x9]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x27]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr q9, [x14, x10]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x12, x10]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x13, x27]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x10]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x9]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x11, x9]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x15, x6]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x6]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x12, x28]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "ldr q0, [x5, #0x10]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x5, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x8, x27]\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "ldr q1, [x5, #0x20]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x5, #0x70]\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x14, x9]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q3, [x5, #0x40]\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x5, #0x80]\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x5, #0x60]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "ldr q4, [x5, #0x50]\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "ldr q8, [x5, #0x90]\n"
+ "add x5, x5, #0xa0\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "st1 { v31.8h }, [x16]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "str q30, [x16, x17]\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "str q29, [x16, x23]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x16, x22]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "st1 { v27.8h }, [x26]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q26, [x26, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q25, [x26, x23]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q24, [x26, x22]\n"
+ "add x26, x26, #0x10\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "st1 { v23.8h }, [x25]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q22, [x25, x17]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "str q21, [x25, x23]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "str q20, [x25, x22]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "st1 { v19.8h }, [x24]\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "str q18, [x24, x17]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q17, [x24, x23]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q16, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x11]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x27]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x8, x6]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x13, x9]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x8, x28]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x15, x9]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x14, x6]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x11, x6]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x13, x6]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x8, x9]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x27]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x12, x10]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x13, x27]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x10]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x9]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x11, x9]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x15, x6]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x6]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x12, x28]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "st1 { v31.8h }, [x16]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q30, [x16, x17]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q29, [x16, x23]\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q28, [x16, x22]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "st1 { v27.8h }, [x26]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "str q26, [x26, x17]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x26, x23]\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q24, [x26, x22]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "st1 { v23.8h }, [x25]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "str q22, [x25, x17]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q21, [x25, x23]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q20, [x25, x22]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "st1 { v19.8h }, [x24]\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "str q18, [x24, x17]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q17, [x24, x23]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q16, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 73f\n"
+ "ldr q13, [x5, #0x0]\n"
+ "ldr q0, [x5, #0x10]\n"
+ "add x22, x14, x10\n"
+ "ldr q1, [x5, #0x20]\n"
+ "add x21, x8, XZR\n"
+ "ldr q2, [x5, #0x30]\n"
+ "add x20, x8, x27\n"
+ "ldr q3, [x5, #0x40]\n"
+ "add x19, x14, x9\n"
+ "ldr q4, [x5, #0x50]\n"
+ "ldr q5, [x5, #0x60]\n"
+ "ldr q6, [x5, #0x70]\n"
+ "ldr q7, [x5, #0x80]\n"
+ "ldr q8, [x5, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x22], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x21]\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ldr h9, [x22, #0x0]\n"
+ "ldr h10, [x21, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
+ "ldr h12, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x19, x11, XZR\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "add x19, x11, x27\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "add x19, x13, x10\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "add x19, x8, x6\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x19, x8, x28\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "add x19, x13, x9\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "add x19, x15, XZR\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "add x19, x15, x27\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "add x19, x12, XZR\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "add x19, x15, x10\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x19, x12, x27\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "add x19, x15, x9\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "add x19, x11, x6\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "add x19, x14, x6\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "add x19, x11, x28\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "add x19, x14, x28\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "add x19, x8, x10\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "add x19, x13, x6\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "add x19, x8, x9\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x19, x14, XZR\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "add x19, x13, x28\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "add x19, x14, x27\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "add x19, x13, XZR\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "add x19, x12, x10\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "add x19, x13, x27\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "add x19, x11, x10\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "add x19, x12, x9\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "add x19, x11, x9\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 62f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 62f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "add x19, x15, x6\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "add x19, x15, x28\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 66f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 66f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "add x19, x12, x6\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "add x19, x12, x28\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 70f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 70f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 71f\n"
+ "mov x19, x16\n"
+ "st1 { v31.s }[0], [x19], x17\n"
+ "add x16, x16, #0x4\n"
+ "st1 { v30.s }[0], [x19], x17\n"
+ "mov x21, x26\n"
+ "st1 { v29.s }[0], [x19], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "add x26, x26, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v25.s }[0], [x21], x17\n"
+ "mov x19, x24\n"
+ "st1 { v24.s }[0], [x21]\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v23.s }[0], [x20], x17\n"
+ "st1 { v22.s }[0], [x20], x17\n"
+ "st1 { v21.s }[0], [x20], x17\n"
+ "st1 { v20.s }[0], [x20]\n"
+ "st1 { v19.s }[0], [x19], x17\n"
+ "st1 { v18.s }[0], [x19], x17\n"
+ "st1 { v17.s }[0], [x19], x17\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "mov x22, x16\n"
+ "st1 { v31.h }[2], [x22], x17\n"
+ "mov x21, x26\n"
+ "st1 { v30.h }[2], [x22], x17\n"
+ "st1 { v27.h }[2], [x21], x17\n"
+ "mov x20, x25\n"
+ "st1 { v29.h }[2], [x22], x17\n"
+ "mov x19, x24\n"
+ "st1 { v28.h }[2], [x22]\n"
+ "st1 { v26.h }[2], [x21], x17\n"
+ "st1 { v25.h }[2], [x21], x17\n"
+ "st1 { v24.h }[2], [x21]\n"
+ "st1 { v23.h }[2], [x20], x17\n"
+ "st1 { v22.h }[2], [x20], x17\n"
+ "st1 { v21.h }[2], [x20], x17\n"
+ "st1 { v20.h }[2], [x20]\n"
+ "st1 { v19.h }[2], [x19], x17\n"
+ "st1 { v18.h }[2], [x19], x17\n"
+ "st1 { v17.h }[2], [x19], x17\n"
+ "st1 { v16.h }[2], [x19]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x22, x16\n"
+ "st1 { v31.h }[0], [x22], x17\n"
+ "mov x21, x26\n"
+ "mov x20, x25\n"
+ "st1 { v30.h }[0], [x22], x17\n"
+ "st1 { v27.h }[0], [x21], x17\n"
+ "mov x19, x24\n"
+ "st1 { v29.h }[0], [x22], x17\n"
+ "st1 { v28.h }[0], [x22]\n"
+ "st1 { v26.h }[0], [x21], x17\n"
+ "st1 { v25.h }[0], [x21], x17\n"
+ "st1 { v24.h }[0], [x21]\n"
+ "st1 { v23.h }[0], [x20], x17\n"
+ "st1 { v22.h }[0], [x20], x17\n"
+ "st1 { v21.h }[0], [x20], x17\n"
+ "st1 { v20.h }[0], [x20]\n"
+ "st1 { v19.h }[0], [x19], x17\n"
+ "st1 { v18.h }[0], [x19], x17\n"
+ "st1 { v17.h }[0], [x19], x17\n"
+ "st1 { v16.h }[0], [x19]\n"
+ "72:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "73:" // Tile loop: End
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x4, #0x1\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x26, x26, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x19\n"
+ "csel x26, x26, XZR, LT\n"
+ "csel x4, x4, x21, LT\n"
+ "cmp x4, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
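
[Editor's note] The register bookkeeping at the top of the tile loop and at label 73 above implements a plain row-major walk over output tiles: the input pointer for tile (i, j) is offset by i * ld_input_row + j * ld_input_col, scaled by the tile factor held in x25 and by sizeof(__fp16) (the "LSL #1"), and the csel pair advances (i, j) with wrap-around. An equivalent C++ sketch — names are assumptions, and uint16_t again stands in for __fp16:

    #include <cstdint>

    struct TileWalk { uint64_t tile_i = 0, tile_j = 0; };

    // Mirrors "add x26, x26, #0x1 / cmp / csel / csel / cmp / blt 1b"
    // at label 73: advance the column index, wrapping to the next row.
    static bool next_tile(TileWalk &t, uint64_t n_tile_rows, uint64_t n_tile_cols)
    {
      if (++t.tile_j < n_tile_cols) return true;  // more tiles on this row
      t.tile_j = 0;                               // wrap column index to zero
      return ++t.tile_i < n_tile_rows;            // step to the next tile row
    }

    // Mirrors the "mul / madd / mul / add ..., LSL #1" sequence after label 1:
    // the factor 4 is the tile size loaded into x25 (stride 1, 4x4 output).
    static const uint16_t *tile_inptr(const uint16_t *inptr, const TileWalk &t,
                                      int64_t ld_input_row, int64_t ld_input_col)
    {
      const int64_t offset =
          (t.tile_i * ld_input_row + t.tile_j * ld_input_col) * 4;
      return inptr + offset;  // element arithmetic; the asm scales to bytes
    }

The output pointer is positioned the same way with ld_output_row/ld_output_col and x24, so the direct kernel needs no indirection table — contrast this with the inptrs[] permutation built by the indirect variant below.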
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..40c019a36c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+ }
+ };
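+
+ // The 36 pointers cover the 6x6 input patch that a 3x3 stride-1 kernel
+ // needs for a 4x4 output tile ((4 - 1)*1 + 3 = 6 rows and columns).
+ // Judging by the oddment labels below (inptrs[0] is loaded as "(2, 2)",
+ // inptrs[1] as "(0, 0)"), input_ptrs is indexed row-major over the patch:
+ //
+ //   input_ptrs[r*6 + c] == pointer to patch element (r, c), r, c in 0..5
+ //
+ // so the constructor merely re-sequences the pointers into the order in
+ // which the assembly consumes them.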
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v14.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "mov x13, #0x10\n" // cntb _, ALL, #1
+ "sub x12, XZR, x13\n"
+ "lsr x11, %x[n_channels], #0x3\n"
+ "cbz x11, 3f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x13, x11, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "ldr q9, [x10, x14]\n"
+ "ldr q10, [x9, x14]\n"
+ "ldr q11, [x28, x14]\n"
+ "ldr q12, [x27, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x24, x14]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "ldr q13, [x15, #0x0]\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x23, x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x14]\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x14]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x10, x14]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "ldr q9, [x10, x13]\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x28, x14]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x9, x13]\n"
+ "add x13, x13, #0x10\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "cmp x13, x11, LSL #4\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x12]\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q24, [x19, x12]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q23, [x22, x12]\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "str q22, [x21, x12]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "str q21, [x20, x12]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "ldr x19, [x17, #0x58]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "str q20, [x19, x12]\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "str q19, [x22, x12]\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "str q18, [x21, x12]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "ldr x19, [x17, #0x78]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x20, x12]\n"
+ "str q16, [x19, x12]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x24, x14]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x23, x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x14]\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x14]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x10, x14]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x28, x14]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x12]\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q24, [x19, x12]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "str q23, [x22, x12]\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "str q22, [x21, x12]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "ldr x19, [x17, #0x58]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q21, [x20, x12]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "str q20, [x19, x12]\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "str q19, [x22, x12]\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "ldr x20, [x17, #0x70]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "ldr x19, [x17, #0x78]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q18, [x21, x12]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x20, x12]\n"
+ "str q16, [x19, x12]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 72f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x10, [x16, #0x0]\n"
+ "add x10, x10, x14\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "add x9, x9, x14\n"
+ "ldr x27, [x16, #0x18]\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x10], #0x2\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ld1 { v9.h }[0], [x10], #0x2\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x26, x26, x14\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "7:" // Oddments: Load input (5, 0): Bit 1: End
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "9:" // Oddments: Load input (5, 5): Bit 1: End
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "11:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "13:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x10], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (0, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x10], #0x2\n"
+ "15:" // Oddments: Load input (0, 4): Bit 1: End
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "17:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x28], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x28], #0x2\n"
+ "19:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (1, 5): Bit 1: Unset
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "21:" // Oddments: Load input (1, 5): Bit 1: End
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "23:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "25:" // Oddments: Load input (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "27:" // Oddments: Load input (4, 5): Bit 1: End
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "29:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x10], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x10], #0x2\n"
+ "31:" // Oddments: Load input (5, 1): Bit 1: End
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "33:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "35:" // Oddments: Load input (5, 4): Bit 1: End
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "37:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "39:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "add x25, x25, x14\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "41:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "43:" // Oddments: Load input (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "45:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x10], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x10], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "add x9, x9, x14\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.h }[2], [x9], #0x2\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v12.h }[0], [x9], #0x2\n"
+ "49:" // Oddments: Load input (2, 5): Bit 1: End
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[2], [x28], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v10.h }[0], [x28], #0x2\n"
+ "51:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "add x27, x27, x14\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.h }[2], [x27], #0x2\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v11.h }[0], [x27], #0x2\n"
+ "53:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "add x26, x26, x14\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x26], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v12.h }[0], [x26], #0x2\n"
+ "55:" // Oddments: Load input (3, 5): Bit 1: End
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "add x25, x25, x14\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "57:" // Oddments: Load input (5, 2): Bit 1: End
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "59:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 61f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 61f\n"
+ "60:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "61:" // Oddments: Load input (5, 3): Bit 1: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[2], [x10], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v10.h }[0], [x10], #0x2\n"
+ "63:" // Oddments: Load input (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "add x9, x9, x14\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 65f\n"
+ "ld1 { v11.h }[2], [x9], #0x2\n"
+ "b 65f\n"
+ "64:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x9], #0x2\n"
+ "65:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[2], [x28], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x28], #0x2\n"
+ "67:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "add x27, x27, x14\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v10.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 69f\n"
+ "ld1 { v10.h }[2], [x27], #0x2\n"
+ "b 69f\n"
+ "68:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v10.h }[0], [x27], #0x2\n"
+ "69:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 70f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[0], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[0], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[2], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[2], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[2], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.h }[2], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.h }[2], [x20]\n"
+ "st1 { v16.h }[2], [x19]\n"
+ "b 71f\n"
+ "70:" // Oddments: Store: Bit 1: Unset
+ "ldr x22, [x17, #0x0]\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x10]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.h }[0], [x20]\n"
+ "st1 { v16.h }[0], [x19]\n"
+ "71:" // Oddments: Store: Bit 1: End
+
+ "72:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ca367cc1af
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
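+
+ // The 5x5 input patch follows from the usual relation
+ // (output - 1)*stride + kernel = (2 - 1)*2 + 3 = 5 in each dimension.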
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..32a6fb964c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
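+
+ // tile_i and tile_j live inside Args: the assembly stores the current
+ // pair back into params_struct at the top of each tile iteration and
+ // advances them (wrapping tile_j, bumping tile_i) at the bottom, so the
+ // struct doubles as loop state across the tile grid.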
+
+ __asm__ __volatile__(
+ "mov x6, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x2\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x17, x17, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v19.8h }, [x24]\n"
+ "add x14, x17, x23, LSL #1\n"
+ "ld1r { v18.8h }, [x21]\n"
+ "add x13, x14, x23, LSL #1\n"
+ "lsl x8, x8, #0x1\n"
+ "add x12, x13, x23, LSL #1\n"
+ "add x11, x12, x23, LSL #1\n"
+ "add x10, x8, x8\n"
+ "add x9, x10, x8\n"
+ "add x28, x9, x8\n"
+ "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x15, x15, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x27, x15, x20, LSL #1\n"
+ "lsl x16, x16, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q0, [x7, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x7, #0x20]\n"
+ "ldr q2, [x7, #0x30]\n"
+ "ldr q3, [x7, #0x40]\n"
+ "ldr q4, [x7, #0x50]\n"
+ "ldr q5, [x7, #0x60]\n"
+ "ldr q6, [x7, #0x70]\n"
+ "ldr q7, [x7, #0x80]\n"
+ "ldr q8, [x7, #0x90]\n"
+ "add x7, x7, #0xa0\n"
+ "ldr q9, [x13, x10]\n"
+ "ld1 { v10.8h }, [x17]\n"
+ "ldr q11, [x17, x8]\n"
+ "ldr q12, [x17, x9]\n"
+ "ldr q13, [x17, x28]\n"
+ "ld1 { v14.8h }, [x14]\n"
+ "ldr q15, [x14, x8]\n"
+ "ldr q16, [x17, x10]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "add x17, x17, #0x10\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q17, [x7, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x17]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x12, x28]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x12, x8]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x13, x8]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "ldr q0, [x7, #0x10]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x13, x9]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x12, x9]\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x11]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x11, x8]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q4, [x7, #0x50]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x12, x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x11, x10]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x17, x9]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr q1, [x7, #0x20]\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x17, x28]\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x11, x9]\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "st1 { v31.8h }, [x15]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "ldr q2, [x7, #0x30]\n"
+ "ldr q5, [x7, #0x60]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x17, x10]\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "str q30, [x15, x16]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x14]\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q3, [x7, #0x40]\n"
+ "ldr q7, [x7, #0x80]\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "st1 { v29.8h }, [x27]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x14, x8]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x17, x8]\n"
+ "ldr q6, [x7, #0x70]\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "ldr q8, [x7, #0x90]\n"
+ "add x7, x7, #0xa0\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q28, [x27, x16]\n"
+ "add x27, x27, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x17, x17, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x13]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x12, x8]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x12, x28]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x13, x8]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x13, x9]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x12, x9]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x11]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x11, x8]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x12, x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x11, x10]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x11, x9]\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "st1 { v31.8h }, [x15]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "str q30, [x15, x16]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "st1 { v29.8h }, [x27]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q28, [x27, x16]\n"
+ "add x27, x27, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 43f\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q0, [x7, #0x10]\n"
+ "add x26, x13, x10\n"
+ "ldr q1, [x7, #0x20]\n"
+ "add x25, x17, XZR\n"
+ "ldr q2, [x7, #0x30]\n"
+ "add x24, x17, x8\n"
+ "ldr q3, [x7, #0x40]\n"
+ "add x23, x17, x9\n"
+ "ldr q4, [x7, #0x50]\n"
+ "add x22, x17, x28\n"
+ "ldr q5, [x7, #0x60]\n"
+ "add x21, x14, XZR\n"
+ "ldr q6, [x7, #0x70]\n"
+ "add x20, x14, x8\n"
+ "ldr q7, [x7, #0x80]\n"
+ "add x19, x17, x10\n"
+ "ldr q8, [x7, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x26], #0x4\n"
+ "ldr s10, [x25], #0x4\n"
+ "ldr s11, [x24], #0x4\n"
+ "ldr s12, [x23], #0x4\n"
+ "ldr s13, [x22], #0x4\n"
+ "ldr s14, [x21], #0x4\n"
+ "ldr s15, [x20], #0x4\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x26]\n"
+ "ld1 { v10.h }[2], [x25]\n"
+ "ld1 { v11.h }[2], [x24]\n"
+ "ld1 { v12.h }[2], [x23]\n"
+ "ld1 { v13.h }[2], [x22]\n"
+ "ld1 { v14.h }[2], [x21]\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ldr h9, [x26, #0x0]\n"
+ "ldr h10, [x25, #0x0]\n"
+ "ldr h11, [x24, #0x0]\n"
+ "ldr h12, [x23, #0x0]\n"
+ "ldr h13, [x22, #0x0]\n"
+ "ldr h14, [x21, #0x0]\n"
+ "ldr h15, [x20, #0x0]\n"
+ "ldr h16, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x19, x14, x9\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x19, x14, x28\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "add x19, x14, x10\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "add x19, x12, XZR\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "add x19, x13, XZR\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s15, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v15.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h15, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x19, x12, x8\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x19, x13, x8\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h16, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "add x19, x12, x9\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "add x19, x13, x9\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x19, x12, x28\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "add x19, x11, XZR\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s15, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr h15, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "add x19, x13, x28\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "add x19, x11, x8\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "add x19, x12, x10\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h16, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x19, x11, x9\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "add x19, x11, x10\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s15, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v15.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h15, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "add x19, x11, x28\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "mov x19, x15\n"
+ "st1 { v31.s }[0], [x19], x16\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x27\n"
+ "st1 { v29.s }[0], [x19], x16\n"
+ "add x27, x27, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "mov x20, x15\n"
+ "st1 { v31.h }[2], [x20], x16\n"
+ "mov x19, x27\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x16\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x15\n"
+ "st1 { v31.h }[0], [x20], x16\n"
+ "mov x19, x27\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x16\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "42:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "43:" // Tile loop: End
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x6, #0x1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x19\n"
+ "csel x27, x27, XZR, LT\n"
+ "csel x6, x6, x21, LT\n"
+ "cmp x6, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
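
For reference, per channel the tile loop above computes a 3x3, stride-2
depthwise multiply-accumulate over a 5x5 input patch, with a bias add and a
fused min/max clamp. A minimal scalar sketch of the same arithmetic (names
are illustrative and not part of the patch; bias, weights and clamp bounds
map to v17, v0-v8 and v19/v18 in the assembly):

    static inline void reference_3x3_s2_tile(const __fp16 in[5][5],
                                             const __fp16 w[3][3],
                                             __fp16 bias,
                                             __fp16 act_min, __fp16 act_max,
                                             __fp16 out[2][2])
    {
      for (int oi = 0; oi < 2; oi++)
      {
        for (int oj = 0; oj < 2; oj++)
        {
          __fp16 acc = bias;  // v28-v31 start as copies of the bias vector v17
          for (int ki = 0; ki < 3; ki++)
            for (int kj = 0; kj < 3; kj++)
              acc += in[2*oi + ki][2*oj + kj] * w[ki][kj];  // stride-2 window
          acc = acc < act_min ? act_min : acc;  // fmax with v19
          acc = acc > act_max ? act_max : acc;  // fmin with v18
          out[oi][oj] = acc;
        }
      }
    }
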
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..f071e21979
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
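+ // Gather the 25 input pointers (row-major 5x5 receptive field) into the
+ // order the assembly consumes them; inptrs[0] is the centre tap (2, 2),
+ // matching the "(2, 2), (0, 0), (0, 1), ..." load comments below.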
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v19.8h }, [x20]\n"
+ "ld1r { v18.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr q12, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr q14, [x21, x14]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ldr q15, [x20, x14]\n"
+ "ldr q16, [x19, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x24, x14]\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x23, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ldr q15, [x22, x14]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "ldr q14, [x25, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x19, x14]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x24, x14]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr q15, [x19, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x20, x14]\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "ldr q9, [x26, x11]\n"
+ "ldr q10, [x25, x11]\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "ldr q12, [x23, x11]\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q13, [x22, x11]\n"
+ "ldr q14, [x21, x11]\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "str q30, [x12, x28]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x24, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "ldr q16, [x19, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "str q28, [x9, x28]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x24, x14]\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x23, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ldr q15, [x22, x14]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "ldr q14, [x25, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x19, x14]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x24, x14]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr q15, [x19, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x20, x14]\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 42f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x26, [x16, #0x0]\n"
+ "ldr x25, [x16, #0x8]\n"
+ "ldr x24, [x16, #0x10]\n"
+ "add x26, x26, x14\n"
+ "ldr x23, [x16, #0x18]\n"
+ "add x25, x25, x14\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x23, x23, x14\n"
+ "ldr x20, [x16, #0x30]\n"
+ "add x22, x22, x14\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v14.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "ld1 { v14.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x19], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v13.h }[0], [x22], #0x2\n"
+ "ld1 { v14.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x19], #0x2\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x26, x26, x14\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v12.h }[2], [x25], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x25], #0x2\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.h }[2], [x24], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v13.h }[0], [x24], #0x2\n"
+ "11:" // Oddments: Load input (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v14.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.h }[2], [x23], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v14.h }[0], [x23], #0x2\n"
+ "13:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v15.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v15.h }[2], [x22], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v15.h }[0], [x22], #0x2\n"
+ "15:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "17:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v13.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v13.h }[2], [x19], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.h }[0], [x19], #0x2\n"
+ "21:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x26], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x26], #0x2\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v14.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v14.h }[2], [x25], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v14.h }[0], [x25], #0x2\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr x24, [x16, #0x90]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[2], [x24], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v15.h }[0], [x24], #0x2\n"
+ "27:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr x23, [x16, #0x98]\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "29:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x22], #0x2\n"
+ "31:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v16.h }[2], [x21], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v16.h }[0], [x21], #0x2\n"
+ "33:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v15.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v15.h }[2], [x19], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v15.h }[0], [x19], #0x2\n"
+ "37:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "39:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "41:" // Oddments: Store: Bit 1: End
+
+ "42:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
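
The indirect variant above receives one pointer per input point rather than
row/column strides, which lets the caller substitute a zeroed pad buffer for
out-of-bounds reads. A hypothetical per-tile driver sketch (illustrative
only; run_tile, the base pointers and the ld_* strides are assumptions, and
the real call site lives in the depthwise driver, not in this patch):

    #include <cstdint>

    namespace arm_conv { namespace depthwise {
    void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
        const __fp16 *const *, __fp16 *const *, const void *, unsigned int,
        const __fp16, const __fp16);
    } }

    void run_tile(const __fp16 *in, std::int64_t ld_in_row, std::int64_t ld_in_col,
                  __fp16 *out, std::int64_t ld_out_row, std::int64_t ld_out_col,
                  const void *packed_params, unsigned int n_channels,
                  __fp16 act_min, __fp16 act_max)
    {
      const __fp16 *inptrs[25];  // row-major 5x5 receptive field of the tile
      __fp16 *outptrs[4];        // row-major 2x2 output points
      for (int i = 0; i < 5; i++)
        for (int j = 0; j < 5; j++)
          inptrs[i * 5 + j] = in + i * ld_in_row + j * ld_in_col;  // or a pad buffer
      for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
          outptrs[i * 2 + j] = out + i * ld_out_row + j * ld_out_col;
      arm_conv::depthwise::a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
          inptrs, outptrs, packed_params, n_channels, act_min, act_max);
    }
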
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..53d2a3a8e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
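
As a quick consistency check of the constants above, the input patch size
follows from the kernel size, stride and output tile size in the usual way:

    input_rows = kernel_rows + stride_rows * (output_rows - 1) = 5 + 1 * (2 - 1) = 6
    input_cols = kernel_cols + stride_cols * (output_cols - 1) = 5 + 1 * (2 - 1) = 6
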
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..ec5f97ab6d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
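+ // The assembly keeps the live tile indices here, storing x28/x27 back to
+ // the struct at the top of each tile-loop iteration via
+ // offsetof_args_tile_i / offsetof_args_tile_j.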
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x2\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x28, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x5, x5, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x8, x5, x23, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x17, x8, x23, LSL #1\n"
+ "lsl x4, x4, #0x1\n"
+ "add x16, x17, x23, LSL #1\n"
+ "add x15, x16, x23, LSL #1\n"
+ "add x14, x15, x23, LSL #1\n"
+ "add x13, x4, x4\n"
+ "add x12, x13, x4\n"
+ "add x11, x12, x4\n"
+ "add x10, x11, x4\n"
+ "mul x19, x28, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x7, x7, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x9, x7, x20, LSL #1\n"
+ "lsl x6, x6, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x3, #0x0]\n"
+ "ldr q0, [x3, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x3, #0x20]\n"
+ "ldr q2, [x3, #0x30]\n"
+ "ldr q3, [x3, #0x40]\n"
+ "ldr q4, [x3, #0x50]\n"
+ "add x3, x3, #0x60\n"
+ "ld1 { v5.8h }, [x5]\n"
+ "ldr q6, [x5, x4]\n"
+ "ld1 { v7.8h }, [x8]\n"
+ "ldr q8, [x8, x4]\n"
+ "ldr q9, [x5, x13]\n"
+ "ldr q13, [x8, x13]\n"
+ "ldr q11, [x5, x12]\n"
+ "ldr q12, [x5, x11]\n"
+ "ldr q10, [x8, x10]\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x8, x12]\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q0, [x3, #0x0]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x8, x11]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q16, [x3, #0x140]\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x3, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x5, x10]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x3, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x17, x4]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x3, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x17, x13]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x17, x12]\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x40]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x8]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x3, #0x50]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x17, x10]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x3, #0x60]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x17, x11]\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x3, #0x70]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x16]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x3, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x16, x4]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x16, x13]\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x3, #0x90]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x16, x10]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x3, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x12]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x3, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x16, x11]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x3, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x3, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x15, x4]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x15, x11]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x3, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x15, x13]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x3, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x15, x12]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x3, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x3, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x3, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x14, x4]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x130]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x14, x13]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x12]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q0, [x3, #0x150]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr q13, [x8, x13]\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr q1, [x3, #0x160]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x5]\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x3, #0x170]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "ldr q6, [x5, x4]\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x5, x12]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x3, #0x180]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x8, x4]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x5, x11]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x5, x13]\n"
+ "ldr q4, [x3, #0x190]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "add x3, x3, #0x1a0\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x7]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q30, [x7, x6]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "st1 { v29.8h }, [x9]\n"
+ "str q28, [x9, x6]\n"
+ "add x9, x9, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x8, x12]\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q0, [x3, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x8, x11]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x3, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x5, x10]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x3, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x17, x4]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x3, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x17, x13]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x17, x12]\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x40]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x3, #0x50]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x17, x10]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x3, #0x60]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x17, x11]\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x3, #0x70]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x16]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x3, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x16, x4]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x16, x13]\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x3, #0x90]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x16, x10]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x3, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x12]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x3, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x16, x11]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x3, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x3, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x15, x4]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x15, x11]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x3, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x15, x13]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x3, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x15, x12]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x3, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x3, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x3, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x14, x4]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x130]\n"
+ "add x3, x3, #0x140\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x14, x13]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x12]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x7]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q30, [x7, x6]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "st1 { v29.8h }, [x9]\n"
+ "str q28, [x9, x6]\n"
+ "add x9, x9, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 61f\n"
+ "ldr q16, [x3, #0x0]\n"
+ "ldr q0, [x3, #0x10]\n"
+ "add x28, x5, XZR\n"
+ "ldr q1, [x3, #0x20]\n"
+ "add x27, x5, x4\n"
+ "ldr q2, [x3, #0x30]\n"
+ "add x26, x8, XZR\n"
+ "ldr q3, [x3, #0x40]\n"
+ "add x25, x8, x4\n"
+ "ldr q4, [x3, #0x50]\n"
+ "add x24, x5, x13\n"
+ "add x23, x8, x13\n"
+ "add x22, x5, x12\n"
+ "add x21, x5, x11\n"
+ "add x20, x8, x10\n"
+ "add x19, x17, XZR\n"
+ "add x3, x3, #0x60\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s5, [x28], #0x4\n"
+ "ldr s6, [x27], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s8, [x25], #0x4\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s13, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v5.h }[2], [x28]\n"
+ "ld1 { v6.h }[2], [x27]\n"
+ "ld1 { v7.h }[2], [x26]\n"
+ "ld1 { v8.h }[2], [x25]\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ldr h5, [x28, #0x0]\n"
+ "ldr h6, [x27, #0x0]\n"
+ "ldr h7, [x26, #0x0]\n"
+ "ldr h8, [x25, #0x0]\n"
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h13, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
+ "ldr h14, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "add x19, x8, x12\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s5, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h5, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "add x19, x8, x11\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s6, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v6.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h6, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "add x19, x5, x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr h0, [x3, #0xc]\n"
+ "add x19, x17, x4\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr h1, [x3, #0xe]\n"
+ "add x19, x17, x13\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr h2, [x3, #0x10]\n"
+ "add x19, x17, x12\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr h3, [x3, #0x12]\n"
+ "add x19, x17, x11\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr h4, [x3, #0x14]\n"
+ "add x19, x17, x10\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s8, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v8.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr h8, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr h0, [x3, #0x16]\n"
+ "add x19, x16, XZR\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s5, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v5.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h5, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "add x19, x16, x4\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s6, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v6.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h6, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr h1, [x3, #0x18]\n"
+ "add x19, x16, x13\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr h2, [x3, #0x1a]\n"
+ "add x19, x16, x12\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x3, #0x1c]\n"
+ "add x19, x16, x11\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x3, #0x1e]\n"
+ "add x19, x16, x10\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr h0, [x3, #0x20]\n"
+ "add x19, x15, XZR\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "add x19, x15, x4\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr h1, [x3, #0x22]\n"
+ "add x19, x15, x13\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s5, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v5.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h5, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr h2, [x3, #0x24]\n"
+ "add x19, x15, x12\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr s6, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v6.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h6, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr h3, [x3, #0x26]\n"
+ "add x19, x15, x11\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s8, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v8.h }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h8, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr h4, [x3, #0x28]\n"
+ "add x19, x15, x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr h0, [x3, #0x2a]\n"
+ "add x19, x14, XZR\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x19, x14, x4\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr h1, [x3, #0x2c]\n"
+ "add x19, x14, x13\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr h2, [x3, #0x2e]\n"
+ "add x19, x14, x12\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x3, #0x30]\n"
+ "add x19, x14, x11\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x3, #0x32]\n"
+ "add x19, x14, x10\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "mov x19, x7\n"
+ "st1 { v31.s }[0], [x19], x6\n"
+ "add x7, x7, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x9\n"
+ "st1 { v29.s }[0], [x19], x6\n"
+ "add x9, x9, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "mov x20, x7\n"
+ "st1 { v31.h }[2], [x20], x6\n"
+ "mov x19, x9\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x6\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x7\n"
+ "st1 { v31.h }[0], [x20], x6\n"
+ "mov x19, x9\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x6\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "60:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "61:" // Tile loop: End
+ "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x28, #0x1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x19\n"
+ "csel x27, x27, XZR, LT\n"
+ "csel x28, x28, x21, LT\n"
+ "cmp x28, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..96e1ae496e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
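+  // The 36 input pointers cover the 6x6 input window a 5x5 kernel needs to
+  // compute a 2x2 output patch at stride 1. The constructor permutes the
+  // first 14 entries (the remainder map straight through) so the pointer
+  // table can be walked sequentially by the loads at the top of the
+  // assembly below.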
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x15, x15, #0x60\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldr q5, [x26, x14]\n"
+ "ldr q6, [x25, x14]\n"
+ "ldr q7, [x24, x14]\n"
+ "ldr q8, [x23, x14]\n"
+ "ldr q9, [x22, x14]\n"
+ "ldr q13, [x21, x14]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ldp x26, x25, [x16, #0x40]\n"
+ "ldr q11, [x20, x14]\n"
+ "ldr q12, [x19, x14]\n"
+ "ldr q10, [x26, x14]\n"
+ "ldr q14, [x25, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x15, #0x30]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "ldr q9, [x19, x14]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x40]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x15, #0x50]\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x25, x14]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x15, #0x60]\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x26, x14]\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x15, #0x70]\n"
+ "ldr q16, [x15, #0x140]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x15, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr q10, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x15, #0x90]\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x19, x14]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x15, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x26, [x16, #0x100]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x15, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "ldr x25, [x16, #0x108]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q8, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x15, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0x110]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x15, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0x118]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x15, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x15, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x15, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x130]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldr q0, [x15, #0x150]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr q1, [x15, #0x160]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "ldr q5, [x26, x11]\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x23, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "ldr q6, [x25, x11]\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "ldr q7, [x24, x11]\n"
+ "ldr q13, [x21, x11]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x23, x11]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q11, [x20, x11]\n"
+ "ldr q12, [x19, x11]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x22, x11]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldp x26, x25, [x16, #0x40]\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "ldr q2, [x15, #0x170]\n"
+ "ldr q3, [x15, #0x180]\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "ldr q10, [x26, x11]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "ldr q14, [x25, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "cmp x11, x27, LSL #4\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q30, [x12, x28]\n"
+ "ldr q4, [x15, #0x190]\n"
+ "add x15, x15, #0x1a0\n"
+ "str q29, [x10, x28]\n"
+ "str q28, [x9, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x15, #0x30]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "ldr q9, [x19, x14]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x40]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x15, #0x50]\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x25, x14]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x15, #0x60]\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x26, x14]\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x15, #0x70]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x15, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr q10, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x15, #0x90]\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x19, x14]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x15, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x26, [x16, #0x100]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x15, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "ldr x25, [x16, #0x108]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q8, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x15, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0x110]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x15, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0x118]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x15, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x15, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x15, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x130]\n"
+ "add x15, x15, #0x140\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x23, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 60f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr x24, [x16, #0x10]\n"
+ "ldr x23, [x16, #0x18]\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x23, x23, x14\n"
+ "ldr x20, [x16, #0x30]\n"
+ "add x22, x22, x14\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x21, x21, x14\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x20, x20, x14\n"
+ "ldr x25, [x16, #0x48]\n"
+ "add x19, x19, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x15, x15, #0x60\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.s }[0], [x26], #0x4\n"
+ "ld1 { v6.s }[0], [x25], #0x4\n"
+ "ld1 { v7.s }[0], [x24], #0x4\n"
+ "ld1 { v8.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v14.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v7.h }[2], [x24], #0x2\n"
+ "ld1 { v8.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x26], #0x2\n"
+ "ld1 { v6.h }[2], [x25], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v14.h }[2], [x25], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ld1 { v5.h }[0], [x26], #0x2\n"
+ "ld1 { v6.h }[0], [x25], #0x2\n"
+ "ld1 { v7.h }[0], [x24], #0x2\n"
+ "ld1 { v8.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v14.h }[0], [x25], #0x2\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "add x24, x24, x14\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[2], [x24], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v5.h }[0], [x24], #0x2\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 5): Bit 1: Unset
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "11:" // Oddments: Load input (0, 5): Bit 1: End
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr h0, [x15, #0xc]\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "add x21, x21, x14\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "13:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr h1, [x15, #0xe]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr h2, [x15, #0x10]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "add x19, x19, x14\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v9.h }[2], [x19], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v9.h }[0], [x19], #0x2\n"
+ "17:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr h3, [x15, #0x12]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x26], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v13.h }[0], [x26], #0x2\n"
+ "19:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr h4, [x15, #0x14]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "add x25, x25, x14\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v8.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v8.h }[2], [x25], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v8.h }[0], [x25], #0x2\n"
+ "21:" // Oddments: Load input (2, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr h0, [x15, #0x16]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr x24, [x16, #0x90]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v5.h }[2], [x24], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v5.h }[0], [x24], #0x2\n"
+ "23:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "ldr x23, [x16, #0x98]\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr h1, [x15, #0x18]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "add x22, x22, x14\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x22], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x22], #0x2\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr h2, [x15, #0x1a]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "add x21, x21, x14\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "29:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x15, #0x1c]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x15, #0x1e]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "add x19, x19, x14\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v14.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v14.h }[2], [x19], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v14.h }[0], [x19], #0x2\n"
+ "33:" // Oddments: Load input (3, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr h0, [x15, #0x20]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "35:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v13.h }[2], [x25], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x25], #0x2\n"
+ "37:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr h1, [x15, #0x22]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v5.h }[2], [x24], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v5.h }[0], [x24], #0x2\n"
+ "39:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr h2, [x15, #0x24]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "add x23, x23, x14\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "41:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr h3, [x15, #0x26]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "add x22, x22, x14\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "43:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr h4, [x15, #0x28]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "add x21, x21, x14\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "45:" // Oddments: Load input (4, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr h0, [x15, #0x2a]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (5, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "49:" // Oddments: Load input (5, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr h1, [x15, #0x2c]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x26, [x16, #0x100]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "51:" // Oddments: Load input (5, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr h2, [x15, #0x2e]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "ldr x25, [x16, #0x108]\n"
+ "add x25, x25, x14\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "53:" // Oddments: Load input (5, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x15, #0x30]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "ldr x24, [x16, #0x110]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "55:" // Oddments: Load input (5, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x15, #0x32]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "ldr x23, [x16, #0x118]\n"
+ "add x23, x23, x14\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v9.h }[2], [x23], #0x2\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v9.h }[0], [x23], #0x2\n"
+ "57:" // Oddments: Load input (5, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "59:" // Oddments: Store: Bit 1: End
+
+ "60:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3468b70f29
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
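+// Kernel descriptor consumed by the depthwise depth-first driver: it records
+// the scalar types involved, the vector-length category, the nine output
+// points produced per call, and the function implementing the kernel.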
+struct a64_fp16_nhwc_generic_output9_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
+
+ a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8ac79f82fa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
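+  // The clamp bounds are splatted below with ld1r: activation_min into v4
+  // and activation_max into v3; every accumulator is clamped with
+  // fmax/fmin against them before being stored.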
+
+ __asm__ __volatile__(
+ "ld1r { v4.8h }, [%x[minmax_vals]]\n"
+ "add x19, %x[minmax_vals], #0x2\n"
+ "mov x11, #0x0\n"
+ "ld1r { v3.8h }, [x19]\n"
+ "lsr x10, %x[n_channels], #0x3\n"
+ "cbz x10, 5f\n"
+ "1:" // Channel loop
+ "movi v25.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov v24.16b, v25.16b\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v22.16b, v25.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "mov v21.16b, v25.16b\n"
+ "ldr q2, [x9, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "add %x[params], %x[params], #0x10\n"
+ "mov v19.16b, v25.16b\n"
+ "ldr q1, [x28, x11]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr q0, [x27, x11]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr q31, [x26, x11]\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ldr q30, [x25, x11]\n"
+ "ldr q29, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ldr q28, [x23, x11]\n"
+ "ldr q27, [x22, x11]\n"
+ "ldr x21, [x20], #0x8\n"
+ "ldr q26, [x21, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "fmla v25.8h, v2.8h, v23.8h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "fmla v24.8h, v1.8h, v23.8h\n"
+ "ldr q2, [x9, x11]\n"
+ "fmla v22.8h, v0.8h, v23.8h\n"
+ "fmla v21.8h, v31.8h, v23.8h\n"
+ "ldr q1, [x28, x11]\n"
+ "fmla v20.8h, v30.8h, v23.8h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "fmla v19.8h, v29.8h, v23.8h\n"
+ "fmla v18.8h, v28.8h, v23.8h\n"
+ "ldr q0, [x27, x11]\n"
+ "fmla v17.8h, v27.8h, v23.8h\n"
+ "fmla v16.8h, v26.8h, v23.8h\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q31, [x26, x11]\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ldr q30, [x25, x11]\n"
+ "ldr q29, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ldr q28, [x23, x11]\n"
+ "ldr q27, [x22, x11]\n"
+ "ldr x21, [x20], #0x8\n"
+ "ldr q26, [x21, x11]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla v25.8h, v2.8h, v23.8h\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "fmla v24.8h, v1.8h, v23.8h\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "fmla v22.8h, v0.8h, v23.8h\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "fmla v21.8h, v31.8h, v23.8h\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "fmla v20.8h, v30.8h, v23.8h\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmla v19.8h, v29.8h, v23.8h\n"
+ "fmla v18.8h, v28.8h, v23.8h\n"
+ "fmla v17.8h, v27.8h, v23.8h\n"
+ "fmla v16.8h, v26.8h, v23.8h\n"
+ "fmax v25.8h, v25.8h, v4.8h\n"
+ "fmax v24.8h, v24.8h, v4.8h\n"
+ "fmax v22.8h, v22.8h, v4.8h\n"
+ "fmin v25.8h, v25.8h, v3.8h\n"
+ "str q25, [x27, x11]\n"
+ "fmin v24.8h, v24.8h, v3.8h\n"
+ "fmin v22.8h, v22.8h, v3.8h\n"
+ "str q24, [x26, x11]\n"
+ "fmax v21.8h, v21.8h, v4.8h\n"
+ "fmax v20.8h, v20.8h, v4.8h\n"
+ "str q22, [x25, x11]\n"
+ "fmax v19.8h, v19.8h, v4.8h\n"
+ "fmax v18.8h, v18.8h, v4.8h\n"
+ "fmin v21.8h, v21.8h, v3.8h\n"
+ "str q21, [x24, x11]\n"
+ "fmin v20.8h, v20.8h, v3.8h\n"
+ "fmin v19.8h, v19.8h, v3.8h\n"
+ "str q20, [x23, x11]\n"
+ "fmin v18.8h, v18.8h, v3.8h\n"
+ "fmax v17.8h, v17.8h, v4.8h\n"
+ "str q19, [x22, x11]\n"
+ "fmax v16.8h, v16.8h, v4.8h\n"
+ "str q18, [x21, x11]\n"
+ "fmin v17.8h, v17.8h, v3.8h\n"
+ "fmin v16.8h, v16.8h, v3.8h\n"
+ "str q17, [x20, x11]\n"
+ "str q16, [x19, x11]\n"
+ "add x11, x11, #0x10\n"
+ "cmp x11, x10, LSL #4\n"
+ "blt 1b\n"
+ "5:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 25f\n"
+ "movi v25.16b, #0x0\n"
+ "cbz %x[bias], 10f\n"
+ "add x19, %x[bias], x11\n"
+ "tbz %x[n_channels], #2, 7f\n"
+ "ld1 { v25.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v25.s }[2], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v25.h }[6], [x19], #0x2\n"
+ "b 9f\n"
+ "6:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v25.h }[4], [x19], #0x2\n"
+ "b 9f\n"
+ "7:" // Oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v25.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v25.h }[2], [x19], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v25.h }[0], [x19], #0x2\n"
+ "9:" // Oddments: Load bias: Bit 2: End
+
+ "10:" // Oddments: Load bias: Done
+ "mov v24.16b, v25.16b\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v22.16b, v25.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add %x[params], %x[params], #0x10\n"
+ "mov v21.16b, v25.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v20.16b, v25.16b\n"
+ "add x9, x9, x11\n"
+ "mov v19.16b, v25.16b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "mov v18.16b, v25.16b\n"
+ "add x28, x28, x11\n"
+ "mov v17.16b, v25.16b\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "mov v16.16b, v25.16b\n"
+ "add x27, x27, x11\n"
+ "ldr x21, [x20], #0x8\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ldr d2, [x9], #0x8\n"
+ "ldr d1, [x28], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d30, [x25], #0x8\n"
+ "ldr d29, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v2.s }[2], [x9], #0x4\n"
+ "ld1 { v1.s }[2], [x28], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v30.s }[2], [x25], #0x4\n"
+ "ld1 { v29.s }[2], [x24], #0x4\n"
+ "ld1 { v28.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v26.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v2.h }[6], [x9], #0x2\n"
+ "ld1 { v1.h }[6], [x28], #0x2\n"
+ "ld1 { v0.h }[6], [x27], #0x2\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v30.h }[6], [x25], #0x2\n"
+ "ld1 { v29.h }[6], [x24], #0x2\n"
+ "ld1 { v28.h }[6], [x23], #0x2\n"
+ "ld1 { v27.h }[6], [x22], #0x2\n"
+ "ld1 { v26.h }[6], [x21], #0x2\n"
+ "b 14f\n"
+ "11:" // Oddments: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v2.h }[4], [x9], #0x2\n"
+ "ld1 { v1.h }[4], [x28], #0x2\n"
+ "ld1 { v0.h }[4], [x27], #0x2\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v30.h }[4], [x25], #0x2\n"
+ "ld1 { v29.h }[4], [x24], #0x2\n"
+ "ld1 { v28.h }[4], [x23], #0x2\n"
+ "ld1 { v27.h }[4], [x22], #0x2\n"
+ "ld1 { v26.h }[4], [x21], #0x2\n"
+ "b 14f\n"
+ "12:" // Oddments: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s2, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s30, [x25], #0x4\n"
+ "ldr s29, [x24], #0x4\n"
+ "ldr s28, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v2.h }[2], [x9], #0x2\n"
+ "ld1 { v1.h }[2], [x28], #0x2\n"
+ "ld1 { v0.h }[2], [x27], #0x2\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "b 14f\n"
+ "13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ldr h2, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h0, [x27], #0x2\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h30, [x25], #0x2\n"
+ "ldr h29, [x24], #0x2\n"
+ "ldr h28, [x23], #0x2\n"
+ "ldr h27, [x22], #0x2\n"
+ "ldr h26, [x21], #0x2\n"
+ "14:" // Oddments: Load: Bit 2: End
+ "subs x19, %x[n_points], #0x1\n"
+ "ble 20f\n"
+ "15:" // Oddments: Planar loop
+ "fmla v25.8h, v2.8h, v23.8h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "fmla v24.8h, v1.8h, v23.8h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "fmla v22.8h, v0.8h, v23.8h\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "fmla v21.8h, v31.8h, v23.8h\n"
+ "add x28, x28, x11\n"
+ "fmla v20.8h, v30.8h, v23.8h\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "fmla v19.8h, v29.8h, v23.8h\n"
+ "add x27, x27, x11\n"
+ "fmla v18.8h, v28.8h, v23.8h\n"
+ "ldr x21, [x20], #0x8\n"
+ "fmla v17.8h, v27.8h, v23.8h\n"
+ "add x26, x26, x11\n"
+ "fmla v16.8h, v26.8h, v23.8h\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr d2, [x9], #0x8\n"
+ "ldr d1, [x28], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d30, [x25], #0x8\n"
+ "ldr d29, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v2.s }[2], [x9], #0x4\n"
+ "ld1 { v1.s }[2], [x28], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v30.s }[2], [x25], #0x4\n"
+ "ld1 { v29.s }[2], [x24], #0x4\n"
+ "ld1 { v28.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v26.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v2.h }[6], [x9], #0x2\n"
+ "ld1 { v1.h }[6], [x28], #0x2\n"
+ "ld1 { v0.h }[6], [x27], #0x2\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v30.h }[6], [x25], #0x2\n"
+ "ld1 { v29.h }[6], [x24], #0x2\n"
+ "ld1 { v28.h }[6], [x23], #0x2\n"
+ "ld1 { v27.h }[6], [x22], #0x2\n"
+ "ld1 { v26.h }[6], [x21], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v2.h }[4], [x9], #0x2\n"
+ "ld1 { v1.h }[4], [x28], #0x2\n"
+ "ld1 { v0.h }[4], [x27], #0x2\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v30.h }[4], [x25], #0x2\n"
+ "ld1 { v29.h }[4], [x24], #0x2\n"
+ "ld1 { v28.h }[4], [x23], #0x2\n"
+ "ld1 { v27.h }[4], [x22], #0x2\n"
+ "ld1 { v26.h }[4], [x21], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Planar loop: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr s2, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s30, [x25], #0x4\n"
+ "ldr s29, [x24], #0x4\n"
+ "ldr s28, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v2.h }[2], [x9], #0x2\n"
+ "ld1 { v1.h }[2], [x28], #0x2\n"
+ "ld1 { v0.h }[2], [x27], #0x2\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ldr h2, [x9], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h0, [x27], #0x2\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h30, [x25], #0x2\n"
+ "ldr h29, [x24], #0x2\n"
+ "ldr h28, [x23], #0x2\n"
+ "ldr h27, [x22], #0x2\n"
+ "ldr h26, [x21], #0x2\n"
+ "19:" // Oddments: Planar loop: Load: Bit 2: End
+ "subs x19, x19, #0x1\n"
+ "bgt 15b\n"
+ "20:" // Oddments: Planar tail
+ "fmla v25.8h, v2.8h, v23.8h\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "add x27, x27, x11\n"
+ "fmla v24.8h, v1.8h, v23.8h\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "fmla v22.8h, v0.8h, v23.8h\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "add x26, x26, x11\n"
+ "fmla v21.8h, v31.8h, v23.8h\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "fmla v20.8h, v30.8h, v23.8h\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x25, x25, x11\n"
+ "fmla v19.8h, v29.8h, v23.8h\n"
+ "add x24, x24, x11\n"
+ "fmla v18.8h, v28.8h, v23.8h\n"
+ "add x23, x23, x11\n"
+ "fmla v17.8h, v27.8h, v23.8h\n"
+ "add x22, x22, x11\n"
+ "fmla v16.8h, v26.8h, v23.8h\n"
+ "add x21, x21, x11\n"
+ "fmax v25.8h, v25.8h, v4.8h\n"
+ "add x20, x20, x11\n"
+ "fmax v24.8h, v24.8h, v4.8h\n"
+ "add x19, x19, x11\n"
+ "fmax v22.8h, v22.8h, v4.8h\n"
+ "fmin v25.8h, v25.8h, v3.8h\n"
+ "fmin v24.8h, v24.8h, v3.8h\n"
+ "fmin v22.8h, v22.8h, v3.8h\n"
+ "fmax v21.8h, v21.8h, v4.8h\n"
+ "fmax v20.8h, v20.8h, v4.8h\n"
+ "fmax v19.8h, v19.8h, v4.8h\n"
+ "fmin v21.8h, v21.8h, v3.8h\n"
+ "fmin v20.8h, v20.8h, v3.8h\n"
+ "fmin v19.8h, v19.8h, v3.8h\n"
+ "fmax v18.8h, v18.8h, v4.8h\n"
+ "fmax v17.8h, v17.8h, v4.8h\n"
+ "fmax v16.8h, v16.8h, v4.8h\n"
+ "fmin v18.8h, v18.8h, v3.8h\n"
+ "fmin v17.8h, v17.8h, v3.8h\n"
+ "fmin v16.8h, v16.8h, v3.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "st1 { v25.d }[0], [x27], #0x8\n"
+ "st1 { v24.d }[0], [x26], #0x8\n"
+ "st1 { v22.d }[0], [x25], #0x8\n"
+ "st1 { v21.d }[0], [x24], #0x8\n"
+ "st1 { v20.d }[0], [x23], #0x8\n"
+ "st1 { v19.d }[0], [x22], #0x8\n"
+ "st1 { v18.d }[0], [x21], #0x8\n"
+ "st1 { v17.d }[0], [x20], #0x8\n"
+ "st1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "st1 { v25.s }[2], [x27], #0x4\n"
+ "st1 { v24.s }[2], [x26], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "st1 { v21.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v19.s }[2], [x22], #0x4\n"
+ "st1 { v18.s }[2], [x21], #0x4\n"
+ "st1 { v17.s }[2], [x20], #0x4\n"
+ "st1 { v16.s }[2], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v25.h }[6], [x27], #0x2\n"
+ "st1 { v24.h }[6], [x26], #0x2\n"
+ "st1 { v22.h }[6], [x25], #0x2\n"
+ "st1 { v21.h }[6], [x24], #0x2\n"
+ "st1 { v20.h }[6], [x23], #0x2\n"
+ "st1 { v19.h }[6], [x22], #0x2\n"
+ "st1 { v18.h }[6], [x21], #0x2\n"
+ "st1 { v17.h }[6], [x20], #0x2\n"
+ "st1 { v16.h }[6], [x19], #0x2\n"
+ "b 24f\n"
+ "21:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v25.h }[4], [x27], #0x2\n"
+ "st1 { v24.h }[4], [x26], #0x2\n"
+ "st1 { v22.h }[4], [x25], #0x2\n"
+ "st1 { v21.h }[4], [x24], #0x2\n"
+ "st1 { v20.h }[4], [x23], #0x2\n"
+ "st1 { v19.h }[4], [x22], #0x2\n"
+ "st1 { v18.h }[4], [x21], #0x2\n"
+ "st1 { v17.h }[4], [x20], #0x2\n"
+ "st1 { v16.h }[4], [x19], #0x2\n"
+ "b 24f\n"
+ "22:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "st1 { v25.s }[0], [x27], #0x4\n"
+ "st1 { v24.s }[0], [x26], #0x4\n"
+ "st1 { v22.s }[0], [x25], #0x4\n"
+ "st1 { v21.s }[0], [x24], #0x4\n"
+ "st1 { v20.s }[0], [x23], #0x4\n"
+ "st1 { v19.s }[0], [x22], #0x4\n"
+ "st1 { v18.s }[0], [x21], #0x4\n"
+ "st1 { v17.s }[0], [x20], #0x4\n"
+ "st1 { v16.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v25.h }[2], [x27], #0x2\n"
+ "st1 { v24.h }[2], [x26], #0x2\n"
+ "st1 { v22.h }[2], [x25], #0x2\n"
+ "st1 { v21.h }[2], [x24], #0x2\n"
+ "st1 { v20.h }[2], [x23], #0x2\n"
+ "st1 { v19.h }[2], [x22], #0x2\n"
+ "st1 { v18.h }[2], [x21], #0x2\n"
+ "st1 { v17.h }[2], [x20], #0x2\n"
+ "st1 { v16.h }[2], [x19], #0x2\n"
+ "b 24f\n"
+ "23:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v25.h }[0], [x27], #0x2\n"
+ "st1 { v24.h }[0], [x26], #0x2\n"
+ "st1 { v22.h }[0], [x25], #0x2\n"
+ "st1 { v21.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x23], #0x2\n"
+ "st1 { v19.h }[0], [x22], #0x2\n"
+ "st1 { v18.h }[0], [x21], #0x2\n"
+ "st1 { v17.h }[0], [x20], #0x2\n"
+ "st1 { v16.h }[0], [x19], #0x2\n"
+ "24:" // Oddments: Store: Bit 2: End
+
+ "25:" // End
+
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
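
The kernel above is machine-generated, so a compact restatement of its contract is useful when reviewing: for each channel it seeds nine accumulators with the bias (or zero when bias is null), performs one fused multiply-add per kernel point against the per-channel weight drawn from params, then clamps to the activation window before storing through the nine output pointers. A minimal C++ sketch of those semantics follows; the flat [point][channel] weight indexing and the n_points-by-9 layout of inptrs are simplifying assumptions for illustration, as the real buffers are packed by the surrounding depthwise framework:

    #include <algorithm>

    // Reference semantics only: one scalar channel at a time, no vectorisation.
    static void output9_mla_reference(
        const __fp16 *const *inptrs,  // assumed: n_points blocks of 9 pointers
        __fp16 *const *outptrs,       // 9 output pointers
        const __fp16 *params,         // weights; flat [point][channel] assumed
        const __fp16 *bias,           // may be nullptr
        unsigned int n_points, unsigned int n_channels,
        __fp16 activation_min, __fp16 activation_max)
    {
      for (unsigned int c = 0; c < n_channels; c++)
      {
        __fp16 acc[9];
        for (int o = 0; o < 9; o++)
          acc[o] = (bias != nullptr) ? bias[c] : (__fp16) 0;

        for (unsigned int p = 0; p < n_points; p++)
        {
          const __fp16 w = params[p * n_channels + c];
          for (int o = 0; o < 9; o++)
            acc[o] += inptrs[p * 9 + o][c] * w;  // the fmla chains in the asm
        }
        for (int o = 0; o < 9; o++)  // the fmax/fmin clamp in the asm
          outptrs[o][c] = std::min(std::max(acc[o], activation_min), activation_max);
      }
    }
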
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..a02a2b2984
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int output_rows(void) { return 2; }
+ constexpr static unsigned int output_cols(void) { return 8; }
+
+ constexpr static unsigned int output_col_regs(void) { return 1; }
+
+ kern_type kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+ a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
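
For orientation, the eight kern_type parameters map, in order, to (inptrs, outptrs, weights, bias, kernel_points, n_output_channels, activation_min, activation_max), matching the implementation in the companion generic.cpp below. A hedged call-site sketch, with the pointer arrays assumed to be sized for the 2-row-by-8-column output tile this kernel produces:

    // Hypothetical driver; the buffer sizing and activation window are assumptions.
    using Kern = arm_conv::depthwise::
        a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst;

    void run_2x8_tile(const __fp16 *const *inptrs,   // 2 pointers per kernel point
                      __fp16 *const *outptrs,        // 16 pointers: 2 rows x 8 cols
                      const __fp16 *packed_weights,
                      const __fp16 *bias,
                      unsigned int kernel_points,
                      unsigned int n_output_channels,
                      const CPUInfo *ci)
    {
      Kern kern(ci);  // the constructor ignores its CPUInfo argument
      kern.kernel(inptrs, outptrs, packed_weights, bias,
                  kernel_points, n_output_channels,
                  (__fp16) 0.0f, (__fp16) 6.0f);  // e.g. a ReLU6 window
    }
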
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7ed7c52db2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const __fp16 *weights,
+ const __fp16 *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v7.8h }, [%x[minmax_vals]]\n"
+ "mov x10, #0x0\n"
+ "add x19, %x[minmax_vals], #0x2\n"
+ "ld1r { v6.8h }, [x19]\n"
+ "lsr x9, %x[n_output_channels], #0x3\n"
+ "cbz x9, 8f\n"
+ "1:" // Output channel loop
+ "movi v16.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x10, #0x1\n"
+ "ldr q16, [%x[bias], x19]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v5.16b, v16.16b\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "mov x19, %x[inptrs]\n"
+ "mov v31.16b, v16.16b\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "mov v30.16b, v16.16b\n"
+ "ldr q3, [x25, #0x0]\n"
+ "mov v29.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v28.16b, v16.16b\n"
+ "ldr q2, [x28, #0x0]\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "mov v18.16b, v16.16b\n"
+ "mov v17.16b, v16.16b\n"
+ "cbz x20, 6f\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "ldr q16, [%x[weights], #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "ldr q2, [x28, #0x0]\n"
+ "fmla v5.8h, v16.8h, v1.h[0]\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "fmla v31.8h, v16.8h, v1.h[1]\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "fmla v30.8h, v16.8h, v1.h[2]\n"
+ "fmla v29.8h, v16.8h, v1.h[3]\n"
+ "fmla v28.8h, v16.8h, v1.h[4]\n"
+ "fmla v27.8h, v16.8h, v1.h[5]\n"
+ "fmla v26.8h, v16.8h, v1.h[6]\n"
+ "fmla v25.8h, v16.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v24.8h, v16.8h, v0.h[0]\n"
+ "fmla v23.8h, v16.8h, v0.h[1]\n"
+ "fmla v22.8h, v16.8h, v0.h[2]\n"
+ "fmla v21.8h, v16.8h, v0.h[3]\n"
+ "fmla v20.8h, v16.8h, v0.h[4]\n"
+ "fmla v19.8h, v16.8h, v0.h[5]\n"
+ "fmla v18.8h, v16.8h, v0.h[6]\n"
+ "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q16, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "lsl x27, x10, #0x1\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "fmla v5.8h, v16.8h, v1.h[0]\n"
+ "fmla v31.8h, v16.8h, v1.h[1]\n"
+ "fmla v30.8h, v16.8h, v1.h[2]\n"
+ "fmla v29.8h, v16.8h, v1.h[3]\n"
+ "fmla v28.8h, v16.8h, v1.h[4]\n"
+ "fmla v27.8h, v16.8h, v1.h[5]\n"
+ "fmla v26.8h, v16.8h, v1.h[6]\n"
+ "fmla v25.8h, v16.8h, v1.h[7]\n"
+ "fmla v24.8h, v16.8h, v0.h[0]\n"
+ "fmla v23.8h, v16.8h, v0.h[1]\n"
+ "fmla v22.8h, v16.8h, v0.h[2]\n"
+ "fmla v21.8h, v16.8h, v0.h[3]\n"
+ "fmla v20.8h, v16.8h, v0.h[4]\n"
+ "fmla v19.8h, v16.8h, v0.h[5]\n"
+ "fmla v18.8h, v16.8h, v0.h[6]\n"
+ "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "fmin v5.8h, v5.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmax v5.8h, v5.8h, v7.8h\n"
+ "str q5, [x19, x27]\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "str q31, [x20, x27]\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "str q30, [x21, x27]\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "str q29, [x22, x27]\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "str q28, [x23, x27]\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "str q27, [x24, x27]\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "str q26, [x25, x27]\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "str q25, [x26, x27]\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "str q24, [x19, x27]\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "str q23, [x20, x27]\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "str q22, [x21, x27]\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "str q21, [x22, x27]\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "str q20, [x23, x27]\n"
+ "fmin v17.8h, v17.8h, v6.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "str q19, [x24, x27]\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v17.8h, v17.8h, v7.8h\n"
+ "str q18, [x25, x27]\n"
+ "str q17, [x26, x27]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "lsl x27, x10, #0x1\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "ldr q2, [x28, #0x0]\n"
+ "fmla v5.8h, v16.8h, v1.h[0]\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v31.8h, v16.8h, v1.h[1]\n"
+ "fmla v30.8h, v16.8h, v1.h[2]\n"
+ "fmla v29.8h, v16.8h, v1.h[3]\n"
+ "fmla v28.8h, v16.8h, v1.h[4]\n"
+ "fmla v27.8h, v16.8h, v1.h[5]\n"
+ "fmla v26.8h, v16.8h, v1.h[6]\n"
+ "fmla v25.8h, v16.8h, v1.h[7]\n"
+ "fmla v24.8h, v16.8h, v0.h[0]\n"
+ "fmla v23.8h, v16.8h, v0.h[1]\n"
+ "fmla v22.8h, v16.8h, v0.h[2]\n"
+ "fmla v21.8h, v16.8h, v0.h[3]\n"
+ "fmla v20.8h, v16.8h, v0.h[4]\n"
+ "fmla v19.8h, v16.8h, v0.h[5]\n"
+ "fmla v18.8h, v16.8h, v0.h[6]\n"
+ "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "fmin v5.8h, v5.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmax v5.8h, v5.8h, v7.8h\n"
+ "str q5, [x19, x27]\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "str q31, [x20, x27]\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "str q30, [x21, x27]\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "str q29, [x22, x27]\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "str q28, [x23, x27]\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "str q27, [x24, x27]\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "str q26, [x25, x27]\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "str q25, [x26, x27]\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "str q24, [x19, x27]\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "str q23, [x20, x27]\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "str q22, [x21, x27]\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "str q21, [x22, x27]\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "str q20, [x23, x27]\n"
+ "fmin v17.8h, v17.8h, v6.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "str q19, [x24, x27]\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v17.8h, v17.8h, v7.8h\n"
+ "str q18, [x25, x27]\n"
+ "str q17, [x26, x27]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "lsl x27, x10, #0x1\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "fmin v5.8h, v5.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmax v5.8h, v5.8h, v7.8h\n"
+ "str q5, [x19, x27]\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "str q31, [x20, x27]\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "str q30, [x21, x27]\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "str q29, [x22, x27]\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "str q28, [x23, x27]\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "str q27, [x24, x27]\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "str q26, [x25, x27]\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "str q25, [x26, x27]\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "str q24, [x19, x27]\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "str q23, [x20, x27]\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "str q22, [x21, x27]\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "str q21, [x22, x27]\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "str q20, [x23, x27]\n"
+ "fmin v17.8h, v17.8h, v6.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "str q19, [x24, x27]\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v17.8h, v17.8h, v7.8h\n"
+ "str q18, [x25, x27]\n"
+ "str q17, [x26, x27]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x8\n"
+ "cmp x10, x9, LSL #3\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x7\n"
+ "beq 23f\n"
+ "8:" // Output channel oddments
+ "movi v16.16b, #0x0\n"
+ "cbz %x[bias], 13f\n"
+ "add x19, %x[bias], x10, LSL #1\n"
+ "tbz %x[n_output_channels], #2, 10f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v16.s }[2], [x19], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v16.h }[6], [x19]\n"
+ "b 12f\n"
+ "9:" // Output channel oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v16.h }[4], [x19]\n"
+ "b 12f\n"
+ "10:" // Output channel oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 11f\n"
+ "ld1 { v16.s }[0], [x19], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v16.h }[0], [x19]\n"
+ "12:" // Output channel oddments: Load bias: Bit 2: End
+
+ "13:" // Output channel oddments: Load bias: Done
+ "mov v5.16b, v16.16b\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "mov x19, %x[inptrs]\n"
+ "mov v31.16b, v16.16b\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "mov v30.16b, v16.16b\n"
+ "ldr q3, [x25, #0x0]\n"
+ "mov v29.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v28.16b, v16.16b\n"
+ "ldr q2, [x28, #0x0]\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "mov v18.16b, v16.16b\n"
+ "mov v17.16b, v16.16b\n"
+ "cbz x20, 17f\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "ldr q16, [%x[weights], #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "beq 15f\n"
+ "14:" // Output channel oddments: Kernel loop
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "ldr q2, [x28, #0x0]\n"
+ "fmla v5.8h, v16.8h, v1.h[0]\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "fmla v31.8h, v16.8h, v1.h[1]\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "fmla v30.8h, v16.8h, v1.h[2]\n"
+ "fmla v29.8h, v16.8h, v1.h[3]\n"
+ "fmla v28.8h, v16.8h, v1.h[4]\n"
+ "fmla v27.8h, v16.8h, v1.h[5]\n"
+ "fmla v26.8h, v16.8h, v1.h[6]\n"
+ "fmla v25.8h, v16.8h, v1.h[7]\n"
+ "ldr q1, [x25, #0x0]\n"
+ "fmla v24.8h, v16.8h, v0.h[0]\n"
+ "fmla v23.8h, v16.8h, v0.h[1]\n"
+ "fmla v22.8h, v16.8h, v0.h[2]\n"
+ "fmla v21.8h, v16.8h, v0.h[3]\n"
+ "fmla v20.8h, v16.8h, v0.h[4]\n"
+ "fmla v19.8h, v16.8h, v0.h[5]\n"
+ "fmla v18.8h, v16.8h, v0.h[6]\n"
+ "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "ldr q16, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 14b\n"
+ "15:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 16f\n"
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "fmla v5.8h, v16.8h, v1.h[0]\n"
+ "fmla v31.8h, v16.8h, v1.h[1]\n"
+ "fmla v30.8h, v16.8h, v1.h[2]\n"
+ "fmla v29.8h, v16.8h, v1.h[3]\n"
+ "fmla v28.8h, v16.8h, v1.h[4]\n"
+ "fmla v27.8h, v16.8h, v1.h[5]\n"
+ "fmla v26.8h, v16.8h, v1.h[6]\n"
+ "fmla v25.8h, v16.8h, v1.h[7]\n"
+ "fmla v24.8h, v16.8h, v0.h[0]\n"
+ "fmla v23.8h, v16.8h, v0.h[1]\n"
+ "fmla v22.8h, v16.8h, v0.h[2]\n"
+ "fmla v21.8h, v16.8h, v0.h[3]\n"
+ "fmla v20.8h, v16.8h, v0.h[4]\n"
+ "fmla v19.8h, v16.8h, v0.h[5]\n"
+ "fmla v18.8h, v16.8h, v0.h[6]\n"
+ "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "b 18f\n"
+ "16:" // Output channel oddments: Odd tail
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "ldp x25, x28, [x19], #0x10\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "ldr q2, [x28, #0x0]\n"
+ "fmla v5.8h, v16.8h, v1.h[0]\n"
+ "ldr q4, [%x[weights], #0x0]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v31.8h, v16.8h, v1.h[1]\n"
+ "fmla v30.8h, v16.8h, v1.h[2]\n"
+ "fmla v29.8h, v16.8h, v1.h[3]\n"
+ "fmla v28.8h, v16.8h, v1.h[4]\n"
+ "fmla v27.8h, v16.8h, v1.h[5]\n"
+ "fmla v26.8h, v16.8h, v1.h[6]\n"
+ "fmla v25.8h, v16.8h, v1.h[7]\n"
+ "fmla v24.8h, v16.8h, v0.h[0]\n"
+ "fmla v23.8h, v16.8h, v0.h[1]\n"
+ "fmla v22.8h, v16.8h, v0.h[2]\n"
+ "fmla v21.8h, v16.8h, v0.h[3]\n"
+ "fmla v20.8h, v16.8h, v0.h[4]\n"
+ "fmla v19.8h, v16.8h, v0.h[5]\n"
+ "fmla v18.8h, v16.8h, v0.h[6]\n"
+ "fmla v17.8h, v16.8h, v0.h[7]\n"
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Single kernel point
+ "fmla v5.8h, v4.8h, v3.h[0]\n"
+ "fmla v31.8h, v4.8h, v3.h[1]\n"
+ "fmla v30.8h, v4.8h, v3.h[2]\n"
+ "fmla v29.8h, v4.8h, v3.h[3]\n"
+ "fmla v28.8h, v4.8h, v3.h[4]\n"
+ "fmla v27.8h, v4.8h, v3.h[5]\n"
+ "fmla v26.8h, v4.8h, v3.h[6]\n"
+ "fmla v25.8h, v4.8h, v3.h[7]\n"
+ "fmla v24.8h, v4.8h, v2.h[0]\n"
+ "fmla v23.8h, v4.8h, v2.h[1]\n"
+ "fmla v22.8h, v4.8h, v2.h[2]\n"
+ "fmla v21.8h, v4.8h, v2.h[3]\n"
+ "fmla v20.8h, v4.8h, v2.h[4]\n"
+ "fmla v19.8h, v4.8h, v2.h[5]\n"
+ "fmla v18.8h, v4.8h, v2.h[6]\n"
+ "fmla v17.8h, v4.8h, v2.h[7]\n"
+ "18:" // Output channel oddments: Done
+ "fmin v5.8h, v5.8h, v6.8h\n"
+ "fmin v31.8h, v31.8h, v6.8h\n"
+ "fmin v30.8h, v30.8h, v6.8h\n"
+ "fmin v29.8h, v29.8h, v6.8h\n"
+ "fmax v5.8h, v5.8h, v7.8h\n"
+ "fmax v31.8h, v31.8h, v7.8h\n"
+ "fmax v30.8h, v30.8h, v7.8h\n"
+ "fmax v29.8h, v29.8h, v7.8h\n"
+ "fmin v28.8h, v28.8h, v6.8h\n"
+ "fmin v27.8h, v27.8h, v6.8h\n"
+ "fmin v26.8h, v26.8h, v6.8h\n"
+ "fmax v28.8h, v28.8h, v7.8h\n"
+ "fmax v27.8h, v27.8h, v7.8h\n"
+ "fmax v26.8h, v26.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v6.8h\n"
+ "fmin v24.8h, v24.8h, v6.8h\n"
+ "fmin v23.8h, v23.8h, v6.8h\n"
+ "fmax v25.8h, v25.8h, v7.8h\n"
+ "fmax v24.8h, v24.8h, v7.8h\n"
+ "fmax v23.8h, v23.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v6.8h\n"
+ "fmin v21.8h, v21.8h, v6.8h\n"
+ "fmin v20.8h, v20.8h, v6.8h\n"
+ "fmax v22.8h, v22.8h, v7.8h\n"
+ "fmax v21.8h, v21.8h, v7.8h\n"
+ "fmax v20.8h, v20.8h, v7.8h\n"
+ "fmin v19.8h, v19.8h, v6.8h\n"
+ "fmin v18.8h, v18.8h, v6.8h\n"
+ "fmin v17.8h, v17.8h, v6.8h\n"
+ "fmax v19.8h, v19.8h, v7.8h\n"
+ "fmax v18.8h, v18.8h, v7.8h\n"
+ "fmax v17.8h, v17.8h, v7.8h\n"
+ "tbz %x[n_output_channels], #2, 20f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.d }[0], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.d }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "tbz %x[n_output_channels], #1, 19f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.s }[2], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.s }[2], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.h }[6], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.h }[6], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.h }[6], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.h }[6], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x26]\n"
+ "b 22f\n"
+ "19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.h }[4], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.h }[4], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.h }[4], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.h }[4], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x26]\n"
+ "b 22f\n"
+ "20:" // Output channel oddments: Done: Store: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 21f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.s }[0], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.h }[2], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.h }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.h }[2], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.h }[2], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x26]\n"
+ "b 22f\n"
+ "21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v5.h }[0], [x19]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #1\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "22:" // Output channel oddments: Done: Store: Bit 2: End
+
+ "23:" // Done
+
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
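
As with the other generated kernels, a scalar restatement makes the register choreography above easier to audit: the sixteen accumulators (v5, v31 down to v17) are the sixteen output pixels of the 2x8 tile, each vector holding eight output channels, and every "fmla ..., v.h[i]" broadcasts one input sample across the weight vector. The sketch below keeps those semantics but runs one output channel at a time; the flat [point][channel] weight indexing is again an assumption standing in for the framework's packed layout:

    #include <algorithm>

    static void packed_to_nhwc_2x8_mla_reference(
        const __fp16 *const *inptrs,   // assumed: 2 pointers (rows) per kernel point
        __fp16 *const *outptrs,        // 16 pointers, row-major over the 2x8 tile
        const __fp16 *weights,         // assumed: flat [point][output channel]
        const __fp16 *bias,            // may be nullptr
        unsigned int kernel_points, unsigned int n_output_channels,
        __fp16 activation_min, __fp16 activation_max)
    {
      for (unsigned int c = 0; c < n_output_channels; c++)
      {
        __fp16 acc[16];
        for (int o = 0; o < 16; o++)
          acc[o] = (bias != nullptr) ? bias[c] : (__fp16) 0;

        for (unsigned int p = 0; p < kernel_points; p++)
        {
          const __fp16 w = weights[p * n_output_channels + c];
          for (int o = 0; o < 16; o++)                   // o/8 selects the input row,
            acc[o] += inptrs[p * 2 + o / 8][o % 8] * w;  // o%8 the column sample
        }
        for (int o = 0; o < 16; o++)
          outptrs[o][c] = std::min(std::max(acc[o], activation_min), activation_max);
      }
    }
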
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..88f20bb125
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
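
The tile-geometry constants in this header satisfy the usual direct-convolution relation input = (output - 1) * stride + kernel, i.e. 4 = (2 - 1) * 1 + 3 in each dimension. A compile-time check of that relation, assuming the header is on the include path:

    #include "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"  // path elided

    using K = arm_conv::depthwise::a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst;
    static_assert(K::input_rows == (K::output_rows - 1) * K::stride_rows + K::kernel_rows,
                  "4 == (2 - 1) * 1 + 3");
    static_assert(K::input_cols == (K::output_cols - 1) * K::stride_cols + K::kernel_cols,
                  "4 == (2 - 1) * 1 + 3");
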
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fae208fbab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x15, #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
+ "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x12, x12, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1r { v18.4s }, [x24]\n"
+ "add x9, x12, x23, LSL #2\n"
+ "ld1r { v17.4s }, [x21]\n"
+ "add x28, x9, x23, LSL #2\n"
+ "lsl x13, x13, #0x2\n"
+ "add x27, x28, x23, LSL #2\n"
+ "add x26, x13, x13\n"
+ "add x25, x26, x13\n"
+ "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x15\n" // offset *= output_tile_size
+ "add x10, x10, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x24, x10, x20, LSL #2\n"
+ "lsl x11, x11, #0x2\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x2\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x9, x13]\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "ldr q11, [x12, x25]\n"
+ "ldr q12, [x9, x26]\n"
+ "ldr q13, [x28, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x27]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x26]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x27, x25]\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr q16, [x14, #0x0]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x12, x13]\n"
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x12, x26]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x9]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "fmla v30.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x28, x25]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr q13, [x28, x13]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x27, x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x12, x25]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x9, x26]\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "add x14, x14, #0xa0\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q30, [x10, x11]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "st1 { v29.4s }, [x24]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "add x10, x10, #0x10\n"
+ "str q28, [x24, x11]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x27]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x26]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x27, x25]\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x12, x13]\n"
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x12, x26]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x9]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x28]\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "fmla v30.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x28, x25]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x27, x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "str q30, [x10, x11]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "st1 { v29.4s }, [x24]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q28, [x24, x11]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 31f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x23, x9, x13\n"
+ "ldr q1, [x14, #0x20]\n"
+ "add x22, x12, XZR\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x21, x12, x25\n"
+ "ldr q3, [x14, #0x40]\n"
+ "add x20, x9, x26\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x19, x28, x13\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x21]\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ldr s9, [x23, #0x0]\n"
+ "ldr s10, [x22, #0x0]\n"
+ "ldr s11, [x21, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
+ "ldr s13, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "add x19, x27, XZR\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "add x19, x27, x25\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "add x19, x12, x13\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x19, x12, x26\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "add x19, x28, x26\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "add x19, x9, XZR\n"
+ "fmla v30.4s, v7.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "add x19, x9, x25\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "add x19, x28, XZR\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "add x19, x28, x25\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "add x19, x27, x13\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "add x19, x27, x26\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x19, x10\n"
+ "st1 { v31.d }[0], [x19], x11\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v30.d }[0], [x19]\n"
+ "mov x19, x24\n"
+ "st1 { v29.d }[0], [x19], x11\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x20, x10\n"
+ "st1 { v31.s }[2], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19], x11\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x10\n"
+ "st1 { v31.s }[0], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.s }[0], [x20]\n"
+ "st1 { v29.s }[0], [x19], x11\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "31:" // Tile loop: End
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x17, #0x1\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x16, x16, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x16, x19\n"
+ "csel x16, x16, XZR, LT\n"
+ "csel x17, x17, x21, LT\n"
+ "cmp x17, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
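For reference, the tile-loop prologue above computes its base pointers exactly as its inline comments state: offset = tile_i * ld_input_row + tile_j * ld_input_col, scaled by the output tile size and sizeof(float). A minimal C++ sketch of that arithmetic (illustrative only; tile_input_base is a hypothetical helper, not part of this patch):

static const float *tile_input_base(const float *inptr,
                                    uint64_t tile_i, uint64_t tile_j,
                                    uint64_t ld_input_row, uint64_t ld_input_col)
{
  // offset  = tile_i * ld_input_row        (mul  x19, x17, x23)
  // offset += tile_j * ld_input_col        (madd x19, x16, x13, x19)
  // offset *= output tile size, here 2     (mul  x19, x19, x25)
  const uint64_t offset = (tile_i * ld_input_row + tile_j * ld_input_col) * 2;
  return inptr + offset;  // add x12, x12, x19, LSL #2 scales by sizeof(float)
}

The oddment path at label 4 then handles the n_channels % 4 remainder with two bit tests: bit 1 of the count selects a two-lane (d-register) transfer and bit 0 a single lane, i.e. if (n & 2) copy two lanes; if (n & 1) copy one lane.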
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2f93a68c23
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[16];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x2\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldr x22, [x16, #0x20]\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr q12, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x19, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr x22, [x16, #0x20]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v30.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr q13, [x22, x11]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x20, x14]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "ldr q12, [x19, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x26, x11]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x25, x11]\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x24, x11]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x23, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "cmp x11, x27, LSL #4\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "str q31, [x13, x28]\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "add x15, x15, #0xa0\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q30, [x12, x28]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q28, [x9, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x19, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "fmla v30.4s, v7.4s, v10.4s\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x19, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q31, [x13, x28]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "str q30, [x12, x28]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 30f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x26, [x16, #0x0]\n"
+ "ldr x25, [x16, #0x8]\n"
+ "add x26, x26, x14\n"
+ "ldr x24, [x16, #0x10]\n"
+ "ldr x23, [x16, #0x18]\n"
+ "add x25, x25, x14\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x26], #0x8\n"
+ "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v13.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v13.s }[2], [x22], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x21, x21, x14\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "7:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x16, #0x30]\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.s }[2], [x19], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v9.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "13:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.s }[2], [x25], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v30.4s, v7.4s, v10.4s\n"
+ "add x24, x24, x14\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.s }[2], [x23], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "19:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "21:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.s }[2], [x19], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "st1 { v31.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x10], #0x8\n"
+ "st1 { v28.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "st1 { v31.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x10], #0x4\n"
+ "st1 { v28.s }[2], [x9], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "29:" // Oddments: Store: Bit 1: End
+
+ "30:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
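The indirect variant replaces the strided address arithmetic with an array of sixteen pre-resolved pointers, one per point of the 4x4 receptive field that a 2x2 output tile of a 3x3 stride-1 kernel reads. Judging by the oddment labels, input_ptrs is row-major over that window (index = row * 4 + col) and the Args constructor permutes it into the order the kernel consumes. A sketch of the correspondence, assuming that row-major indexing (which the labels above bear out):

constexpr unsigned int window_cols = 4;  // 4x4 window: 2x2 outputs, 3x3 kernel, stride 1
constexpr unsigned int idx(unsigned int row, unsigned int col)
{
  return row * window_cols + col;
}
// inptrs[0..4] feed the first five loads (q9..q13), which the oddment path
// labels as points (1,1), (0,0), (0,3), (1,2) and (2,1) respectively:
static_assert(idx(1, 1) == 5, "inptrs[0] = input_ptrs[5]");
static_assert(idx(0, 0) == 0, "inptrs[1] = input_ptrs[0]");
static_assert(idx(0, 3) == 3, "inptrs[2] = input_ptrs[3]");
static_assert(idx(1, 2) == 6, "inptrs[3] = input_ptrs[6]");
static_assert(idx(2, 1) == 9, "inptrs[4] = input_ptrs[9]");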
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6a882ec52f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
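As in the other descriptor headers in this patch, the window sizes follow from the convolution geometry: input_rows = (output_rows - 1) * stride_rows + kernel_rows, hence the 5x5 input tile declared above for a 3x3 output with a 3x3 stride-1 kernel. A one-line check (illustrative only):

static_assert((3 - 1) * 1 + 3 == 5, "input_rows = (output_rows - 1) * stride_rows + kernel_rows");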
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..401528aa59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,825 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x23, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x15, x15, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1r { v18.4s }, [x24]\n"
+ "add x12, x15, x22, LSL #2\n"
+ "ld1r { v17.4s }, [x21]\n"
+ "add x11, x12, x22, LSL #2\n"
+ "lsl x16, x16, #0x2\n"
+ "add x10, x11, x22, LSL #2\n"
+ "add x9, x10, x22, LSL #2\n"
+ "add x28, x16, x16\n"
+ "add x27, x28, x16\n"
+ "add x26, x27, x16\n"
+ "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x13, x13, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x25, x13, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "lsl x14, x14, #0x2\n"
+ "add x22, x14, x14\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x2\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldr q9, [x11, x28]\n"
+ "ld1 { v10.4s }, [x15]\n"
+ "ldr q11, [x15, x26]\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "ldr q13, [x12, x28]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "cmp x21, x19, LSL #4\n"
+ "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x11, x27]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x11, x16]\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x26]\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v26.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x15, x16]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v30.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x12]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x12, x26]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x10]\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x10, x28]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x10, x26]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x9, x16]\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x12, x16]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v26.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x9, x27]\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x12, x27]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x10, x16]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v5.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v24.4s, v8.4s, v13.4s\n"
+ "ld1 { v10.4s }, [x15]\n"
+ "fmla v23.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x10, x27]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x11, x26]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "ldr q9, [x11, x28]\n"
+ "fmla v26.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v13.4s\n"
+ "fmla v23.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x9, x28]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmla v25.4s, v0.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x26]\n"
+ "fmla v25.4s, v8.4s, v13.4s\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x12, x28]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "add x17, x17, #0xa0\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "st1 { v31.4s }, [x13]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "str q30, [x13, x14]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q29, [x13, x22]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "st1 { v28.4s }, [x25]\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "str q27, [x25, x14]\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "str q26, [x25, x22]\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "add x25, x25, #0x10\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "st1 { v25.4s }, [x24]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "str q24, [x24, x14]\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "str q23, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x11, x27]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x11, x16]\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x26]\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v26.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x15, x16]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v30.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x12]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x12, x26]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x10]\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x10, x28]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x10, x26]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x9, x16]\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x12, x16]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v26.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x9, x27]\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x12, x27]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x10, x16]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v5.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v24.4s, v8.4s, v13.4s\n"
+ "fmla v23.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x10, x27]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x11, x26]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v26.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v13.4s\n"
+ "fmla v23.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x9, x28]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v0.4s, v12.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v25.4s, v8.4s, v13.4s\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "st1 { v31.4s }, [x13]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "str q30, [x13, x14]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q29, [x13, x22]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "st1 { v28.4s }, [x25]\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "str q27, [x25, x14]\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "str q26, [x25, x22]\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "add x25, x25, #0x10\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "st1 { v25.4s }, [x24]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "str q24, [x24, x14]\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "str q23, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 49f\n"
+ "ldr q16, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "add x23, x11, x28\n"
+ "ldr q1, [x17, #0x20]\n"
+ "add x22, x15, XZR\n"
+ "ldr q2, [x17, #0x30]\n"
+ "add x21, x15, x26\n"
+ "ldr q3, [x17, #0x40]\n"
+ "add x20, x9, XZR\n"
+ "ldr q4, [x17, #0x50]\n"
+ "add x19, x12, x28\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x21]\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ldr s9, [x23, #0x0]\n"
+ "ldr s10, [x22, #0x0]\n"
+ "ldr s11, [x21, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
+ "ldr s13, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x19, x9, x26\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v26.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "add x19, x11, x16\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x19, x15, x16\n"
+ "fmla v30.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "add x19, x15, x27\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x19, x11, x27\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "add x19, x12, XZR\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "add x19, x12, x26\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "add x19, x10, XZR\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "add x19, x10, x28\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "add x19, x10, x26\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v26.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "add x19, x9, x16\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v25.4s, v7.4s, v13.4s\n"
+ "add x19, x12, x16\n"
+ "fmla v24.4s, v6.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "add x19, x12, x27\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.4s, v5.4s, v11.4s\n"
+ "add x19, x9, x27\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v24.4s, v8.4s, v13.4s\n"
+ "add x19, x10, x16\n"
+ "fmla v23.4s, v7.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "add x19, x15, x28\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x19, x10, x27\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "add x19, x11, XZR\n"
+ "fmla v26.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v13.4s\n"
+ "fmla v23.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "add x19, x11, x26\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "add x19, x9, x28\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v25.4s, v8.4s, v13.4s\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "mov x19, x13\n"
+ "st1 { v31.d }[0], [x19], x14\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v30.d }[0], [x19], x14\n"
+ "mov x20, x25\n"
+ "st1 { v29.d }[0], [x19]\n"
+ "st1 { v28.d }[0], [x20], x14\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v27.d }[0], [x20], x14\n"
+ "mov x19, x24\n"
+ "st1 { v26.d }[0], [x20]\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v25.d }[0], [x19], x14\n"
+ "st1 { v24.d }[0], [x19], x14\n"
+ "st1 { v23.d }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "mov x21, x13\n"
+ "st1 { v31.s }[2], [x21], x14\n"
+ "mov x20, x25\n"
+ "st1 { v30.s }[2], [x21], x14\n"
+ "st1 { v28.s }[2], [x20], x14\n"
+ "mov x19, x24\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20], x14\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "st1 { v25.s }[2], [x19], x14\n"
+ "st1 { v24.s }[2], [x19], x14\n"
+ "st1 { v23.s }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x13\n"
+ "st1 { v31.s }[0], [x21], x14\n"
+ "mov x20, x25\n"
+ "mov x19, x24\n"
+ "st1 { v30.s }[0], [x21], x14\n"
+ "st1 { v28.s }[0], [x20], x14\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v27.s }[0], [x20], x14\n"
+ "st1 { v26.s }[0], [x20]\n"
+ "st1 { v25.s }[0], [x19], x14\n"
+ "st1 { v24.s }[0], [x19], x14\n"
+ "st1 { v23.s }[0], [x19]\n"
+ "48:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "49:" // Tile loop: End
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x7, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x8, x8, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x8, x19\n"
+ "csel x8, x8, XZR, LT\n"
+ "csel x7, x7, x21, LT\n"
+ "cmp x7, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
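
For readers following the generated assembly above: per channel lane, each direct tile kernel computes a plain 3x3 stride-1 depthwise convolution with a fused activation clamp. v16 holds the broadcast bias, v0-v8 the nine weights, and the trailing fmax/fmin pairs against v18/v17 apply the activation bounds. A minimal C++ reference of that arithmetic (illustrative names, not part of the patch):

#include <algorithm>

// Reference for the per-channel arithmetic of the 3x3 stride-1, 3x3-output
// tile kernels: v16 is the broadcast bias, v0..v8 the nine weights (row-major),
// v18/v17 the activation bounds applied by the trailing fmax/fmin pairs.
void depthwise_3x3_s1_tile_reference(
    const float in[5][5],          // a 3x3 output tile reads a 5x5 input patch
    const float w[3][3],           // weights, row-major (v0..v8)
    float bias,                    // v16
    float act_min, float act_max,  // v18 / v17
    float out[3][3])
{
  for (int oi = 0; oi < 3; oi++)
  {
    for (int oj = 0; oj < 3; oj++)
    {
      float acc = bias;
      for (int ki = 0; ki < 3; ki++)
        for (int kj = 0; kj < 3; kj++)
          acc += w[ki][kj] * in[oi + ki][oj + kj];  // the fmla chains
      out[oi][oj] = std::min(std::max(acc, act_min), act_max);  // fmax, fmin
    }
  }
}

// The tile-loop epilogue (the csel pair before "blt 1b") is a row-major
// advance over the output tile grid, equivalent to:
//   if (++tile_j == n_tile_cols) { tile_j = 0; ++tile_i; }
//   // loop again while tile_i < n_tile_rows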
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..39ec001ae1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "mov x14, #0x0\n"
+ "mov x13, #0x10\n" // cntb _, ALL, #1
+ "sub x12, XZR, x13\n"
+ "lsr x11, %x[n_channels], #0x2\n"
+ "cbz x11, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x13, x11, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "ldr x26, [x16, #0x20]\n"
+ "ldr q9, [x10, x14]\n"
+ "ldr q10, [x9, x14]\n"
+ "ldr q11, [x28, x14]\n"
+ "ldr q12, [x27, x14]\n"
+ "ldr q13, [x26, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0x60]\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v26.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v30.4s, v6.4s, v11.4s\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x27, x14]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x28, x14]\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v5.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v24.4s, v8.4s, v13.4s\n"
+ "ldr x26, [x16, #0x20]\n"
+ "fmla v23.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x25, x14]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v26.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v13.4s\n"
+ "fmla v23.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "fmla v25.4s, v0.4s, v12.4s\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr q9, [x10, x13]\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "ldr q10, [x9, x13]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v25.4s, v8.4s, v13.4s\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x26, x13]\n"
+ "add x13, x13, #0x10\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "ldr q4, [x15, #0x50]\n"
+ "cmp x13, x11, LSL #4\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q31, [x22, x12]\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q30, [x21, x12]\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "str q29, [x20, x12]\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "str q28, [x19, x12]\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "str q27, [x22, x12]\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "str q25, [x20, x12]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "str q24, [x19, x12]\n"
+ "str q23, [x22, x12]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0x60]\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v26.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v30.4s, v6.4s, v11.4s\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "ldr q13, [x27, x14]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v13.4s\n"
+ "ldr q13, [x28, x14]\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v5.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v24.4s, v8.4s, v13.4s\n"
+ "fmla v23.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x25, x14]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v26.4s, v7.4s, v13.4s\n"
+ "fmla v24.4s, v5.4s, v13.4s\n"
+ "fmla v23.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v0.4s, v12.4s\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v25.4s, v8.4s, v13.4s\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q30, [x21, x12]\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q29, [x20, x12]\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "str q28, [x19, x12]\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "str q27, [x22, x12]\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "str q25, [x20, x12]\n"
+ "str q24, [x19, x12]\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "str q23, [x22, x12]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 48f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x10, [x16, #0x0]\n"
+ "add x10, x10, x14\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "add x9, x9, x14\n"
+ "ldr x27, [x16, #0x18]\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x10], #0x8\n"
+ "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v11.d }[0], [x28], #0x8\n"
+ "ld1 { v12.d }[0], [x27], #0x8\n"
+ "ld1 { v13.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x10], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x27], #0x4\n"
+ "ld1 { v13.s }[2], [x26], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v13.s }[0], [x26], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x25, x25, x14\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+ "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v26.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v12.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x25], #0x4\n"
+ "7:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "ldr x24, [x16, #0x30]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "9:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "ldr x23, [x16, #0x38]\n"
+ "fmla v30.4s, v6.4s, v11.4s\n"
+ "add x23, x23, x14\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "ldr x10, [x16, #0x40]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x10], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x10], #0x4\n"
+ "13:" // Oddments: Load input (0, 3): Bit 1: End
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr x9, [x16, #0x48]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "15:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v10.4s\n"
+ "ldr x28, [x16, #0x50]\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x28, x28, x14\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x28], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr x27, [x16, #0x58]\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x27], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x27], #0x4\n"
+ "19:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x26], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "21:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.s }[2], [x25], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "23:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x24, x24, x14\n"
+ "fmla v26.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "27:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v25.4s, v7.4s, v13.4s\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v24.4s, v6.4s, v13.4s\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x10], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x10], #0x4\n"
+ "29:" // Oddments: Load input (1, 1): Bit 1: End
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "add x9, x9, x14\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x9], #0x4\n"
+ "31:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.4s, v5.4s, v11.4s\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x28, x28, x14\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.d }[0], [x28], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v13.s }[2], [x28], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x28], #0x4\n"
+ "33:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v24.4s, v8.4s, v13.4s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v23.4s, v7.4s, v13.4s\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.s }[2], [x27], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "35:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x26, x26, x14\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v11.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v11.s }[2], [x26], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "37:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x25, x25, x14\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x25], #0x4\n"
+ "39:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.4s, v7.4s, v13.4s\n"
+ "add x24, x24, x14\n"
+ "fmla v24.4s, v5.4s, v13.4s\n"
+ "fmla v23.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "41:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "add x23, x23, x14\n"
+ "fmla v25.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "43:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "add x10, x10, x14\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v13.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v13.s }[2], [x10], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x10], #0x4\n"
+ "45:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v25.4s, v8.4s, v13.4s\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmax v27.4s, v27.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v18.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "fmin v27.4s, v27.4s, v17.4s\n"
+ "fmin v26.4s, v26.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v18.4s\n"
+ "fmax v23.4s, v23.4s, v18.4s\n"
+ "fmin v25.4s, v25.4s, v17.4s\n"
+ "fmin v24.4s, v24.4s, v17.4s\n"
+ "fmin v23.4s, v23.4s, v17.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.d }[0], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.d }[0], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.d }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.d }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.d }[0], [x19]\n"
+ "add x12, x12, #0x8\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[2], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[2], [x19]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 47f\n"
+ "46:" // Oddments: Store: Bit 1: Unset
+ "ldr x22, [x17, #0x0]\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x10]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[0], [x22]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "47:" // Oddments: Store: Bit 1: End
+
+ "48:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
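
The indirect variant above receives its 25 input pointers pre-gathered (the Args constructor permutes input_ptrs into the order the assembly consumes them) and services the channel remainder with the tbz bit tests on n_channels. A short sketch of that tail scheme, under assumed names:

// Sketch of the "Oddments" tail handling driven by "tst %x[n_channels], #0x3"
// and the tbz/ld1-lane pairs above: bit 1 of the remainder selects a two-float
// load, bit 0 a single extra lane.  load_tail is an assumed helper name, not
// library API.
void load_tail(const float *p, unsigned int n_channels, float lanes[4])
{
  const unsigned int r = n_channels & 3;  // remainder after the 4-wide loop
  unsigned int filled = 0;
  if (r & 2)  // tbz #1 not taken: "ld1 { v.d }[0]" / "ldr d"
  {
    lanes[0] = p[0];
    lanes[1] = p[1];
    filled = 2;
  }
  if (r & 1)  // tbz #0 not taken: "ld1 { v.s }[2]" or "ldr s"
    lanes[filled] = p[filled];
}

The stores at labels 46/47 mirror the same bit tests in reverse, writing back only the output lanes that were actually computed.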
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..84bac12429
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
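
The constants in these strategy structs are tied together by the usual direct-convolution patch geometry; a compile-time check of the relation (illustrative only, not part of the patch):

// input extent = kernel extent + (output extent - 1) * stride; for the
// 3x3, stride-1, 4x4-output kernel this gives the input_rows/input_cols of 6
// declared above.  patch_extent is a hypothetical helper, not library API.
constexpr unsigned int patch_extent(unsigned int kernel, unsigned int output, unsigned int stride)
{
  return kernel + (output - 1) * stride;
}
static_assert(patch_extent(3, 4, 1) == 6, "6x6 input patch for a 4x4 output tile");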
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..616fd0d0e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1229 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x4, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x24, #0x4\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x23, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x7, #0x0\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x4, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1r { v15.4s }, [x23]\n"
+ "add x15, x8, x22, LSL #2\n"
+ "ld1r { v14.4s }, [x21]\n"
+ "add x14, x15, x22, LSL #2\n"
+ "lsl x6, x6, #0x2\n"
+ "add x13, x14, x22, LSL #2\n"
+ "add x12, x13, x22, LSL #2\n"
+ "add x11, x12, x22, LSL #2\n"
+ "add x10, x6, x6\n"
+ "add x9, x10, x6\n"
+ "add x28, x9, x6\n"
+ "add x27, x28, x6\n"
+ "mul x19, x4, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x24\n" // offset *= output_tile_size
+ "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x26, x16, x20, LSL #2\n"
+ "add x25, x26, x20, LSL #2\n"
+ "add x24, x25, x20, LSL #2\n"
+ "lsl x17, x17, #0x2\n"
+ "add x23, x17, x17\n"
+ "add x22, x23, x17\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x2\n"
+ "cbz x19, 4f\n"
+ "ldr q13, [x5, #0x0]\n"
+ "ldr q0, [x5, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x5, #0x20]\n"
+ "ldr q2, [x5, #0x30]\n"
+ "ldr q3, [x5, #0x40]\n"
+ "ldr q4, [x5, #0x50]\n"
+ "ldr q5, [x5, #0x60]\n"
+ "ldr q6, [x5, #0x70]\n"
+ "ldr q7, [x5, #0x80]\n"
+ "ldr q8, [x5, #0x90]\n"
+ "add x5, x5, #0xa0\n"
+ "ldr q9, [x14, x10]\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "ldr q11, [x8, x27]\n"
+ "ldr q12, [x14, x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "add x21, x21, #0x10\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "cmp x21, x19, LSL #4\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x11]\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x11, x27]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x8, x6]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x13, x9]\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x8, x28]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr q13, [x5, #0x0]\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x12]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v21.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x15, x10]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x15, x9]\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x12, x27]\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x14, x6]\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x11, x6]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v19.4s, v7.4s, v11.4s\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x11, x28]\n"
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v16.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x13, x6]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x8, x9]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x14]\n"
+ "fmla v27.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v4.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x13, x28]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x14, x27]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "ldr q9, [x14, x10]\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v24.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "fmla v20.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x12, x10]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x13, x27]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v27.4s, v6.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x11, x10]\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v11.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x12, x9]\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v5.4s, v12.4s\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x11, x9]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v19.4s, v8.4s, v10.4s\n"
+ "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x15, x6]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v6.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x12, x6]\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x12, x28]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "ldr q0, [x5, #0x10]\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x5, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x8, x27]\n"
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "ldr q1, [x5, #0x20]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x5, #0x70]\n"
+ "fmla v19.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x14, x9]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "ldr q3, [x5, #0x40]\n"
+ "fmla v20.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x5, #0x80]\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
+ "ldr q5, [x5, #0x60]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "ldr q4, [x5, #0x50]\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "ldr q8, [x5, #0x90]\n"
+ "add x5, x5, #0xa0\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "st1 { v31.4s }, [x16]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "str q30, [x16, x17]\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "str q29, [x16, x23]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q28, [x16, x22]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "st1 { v27.4s }, [x26]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q26, [x26, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q25, [x26, x23]\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q24, [x26, x22]\n"
+ "add x26, x26, #0x10\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "st1 { v23.4s }, [x25]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q22, [x25, x17]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "str q21, [x25, x23]\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "str q20, [x25, x22]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "add x25, x25, #0x10\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "st1 { v19.4s }, [x24]\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "str q18, [x24, x17]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "str q17, [x24, x23]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q16, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x11]\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x11, x27]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x8, x6]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x13, x9]\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x8, x28]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x12]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v21.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x15, x10]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x15, x9]\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x12, x27]\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x14, x6]\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x11, x6]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v19.4s, v7.4s, v11.4s\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x11, x28]\n"
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v16.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x13, x6]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x8, x9]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x14]\n"
+ "fmla v27.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v4.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x13, x28]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x14, x27]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v24.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "fmla v20.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x12, x10]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x13, x27]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v27.4s, v6.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x11, x10]\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v11.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x12, x9]\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v5.4s, v12.4s\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x11, x9]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v19.4s, v8.4s, v10.4s\n"
+ "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x15, x6]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v6.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x12, x6]\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x12, x28]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "fmla v19.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v20.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "st1 { v31.4s }, [x16]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q30, [x16, x17]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q29, [x16, x23]\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "str q28, [x16, x22]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "st1 { v27.4s }, [x26]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "str q26, [x26, x17]\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x26, x23]\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q24, [x26, x22]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "st1 { v23.4s }, [x25]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "str q22, [x25, x17]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q21, [x25, x23]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q20, [x25, x22]\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "add x25, x25, #0x10\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "st1 { v19.4s }, [x24]\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "str q18, [x24, x17]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "str q17, [x24, x23]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q16, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 73f\n"
+ "ldr q13, [x5, #0x0]\n"
+ "ldr q0, [x5, #0x10]\n"
+ "add x22, x14, x10\n"
+ "ldr q1, [x5, #0x20]\n"
+ "add x21, x8, XZR\n"
+ "ldr q2, [x5, #0x30]\n"
+ "add x20, x8, x27\n"
+ "ldr q3, [x5, #0x40]\n"
+ "add x19, x14, x9\n"
+ "ldr q4, [x5, #0x50]\n"
+ "ldr q5, [x5, #0x60]\n"
+ "ldr q6, [x5, #0x70]\n"
+ "ldr q7, [x5, #0x80]\n"
+ "ldr q8, [x5, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d11, [x20], #0x8\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x21]\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ldr s9, [x22, #0x0]\n"
+ "ldr s10, [x21, #0x0]\n"
+ "ldr s11, [x20, #0x0]\n"
+ "ldr s12, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x19, x11, XZR\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+ "add x19, x11, x27\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+ "add x19, x13, x10\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "add x19, x8, x6\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x19, x8, x28\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "add x19, x13, x9\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "add x19, x15, XZR\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v21.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "add x19, x15, x27\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "add x19, x12, XZR\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "add x19, x15, x10\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x19, x12, x27\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "add x19, x15, x9\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "add x19, x11, x6\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v19.4s, v7.4s, v11.4s\n"
+ "add x19, x14, x6\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "add x19, x11, x28\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "add x19, x14, x28\n"
+ "fmla v16.4s, v7.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "add x19, x8, x10\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "add x19, x13, x6\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v27.4s, v7.4s, v11.4s\n"
+ "add x19, x8, x9\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v4.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x19, x14, XZR\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "add x19, x13, x28\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "add x19, x14, x27\n"
+ "fmla v24.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "fmla v20.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "add x19, x13, XZR\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v27.4s, v6.4s, v10.4s\n"
+ "add x19, x12, x10\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v23.4s, v8.4s, v11.4s\n"
+ "add x19, x13, x27\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "add x19, x11, x10\n"
+ "fmla v20.4s, v5.4s, v12.4s\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v19.4s, v8.4s, v10.4s\n"
+ "add x19, x12, x9\n"
+ "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "add x19, x11, x9\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v6.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 62f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 62f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "add x19, x15, x6\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "add x19, x15, x28\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 66f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 66f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "add x19, x12, x6\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "add x19, x12, x28\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "fmla v19.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 70f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 70f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v20.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 71f\n"
+ "mov x19, x16\n"
+ "st1 { v31.d }[0], [x19], x17\n"
+ "add x16, x16, #0x8\n"
+ "st1 { v30.d }[0], [x19], x17\n"
+ "mov x21, x26\n"
+ "st1 { v29.d }[0], [x19], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "add x26, x26, #0x8\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v25.d }[0], [x21], x17\n"
+ "mov x19, x24\n"
+ "st1 { v24.d }[0], [x21]\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v23.d }[0], [x20], x17\n"
+ "st1 { v22.d }[0], [x20], x17\n"
+ "st1 { v21.d }[0], [x20], x17\n"
+ "st1 { v20.d }[0], [x20]\n"
+ "st1 { v19.d }[0], [x19], x17\n"
+ "st1 { v18.d }[0], [x19], x17\n"
+ "st1 { v17.d }[0], [x19], x17\n"
+ "st1 { v16.d }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "mov x22, x16\n"
+ "st1 { v31.s }[2], [x22], x17\n"
+ "mov x21, x26\n"
+ "st1 { v30.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "mov x20, x25\n"
+ "st1 { v29.s }[2], [x22], x17\n"
+ "mov x19, x24\n"
+ "st1 { v28.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "st1 { v25.s }[2], [x21], x17\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v23.s }[2], [x20], x17\n"
+ "st1 { v22.s }[2], [x20], x17\n"
+ "st1 { v21.s }[2], [x20], x17\n"
+ "st1 { v20.s }[2], [x20]\n"
+ "st1 { v19.s }[2], [x19], x17\n"
+ "st1 { v18.s }[2], [x19], x17\n"
+ "st1 { v17.s }[2], [x19], x17\n"
+ "st1 { v16.s }[2], [x19]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x22, x16\n"
+ "st1 { v31.s }[0], [x22], x17\n"
+ "mov x21, x26\n"
+ "mov x20, x25\n"
+ "st1 { v30.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "mov x19, x24\n"
+ "st1 { v29.s }[0], [x22], x17\n"
+ "st1 { v28.s }[0], [x22]\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "st1 { v25.s }[0], [x21], x17\n"
+ "st1 { v24.s }[0], [x21]\n"
+ "st1 { v23.s }[0], [x20], x17\n"
+ "st1 { v22.s }[0], [x20], x17\n"
+ "st1 { v21.s }[0], [x20], x17\n"
+ "st1 { v20.s }[0], [x20]\n"
+ "st1 { v19.s }[0], [x19], x17\n"
+ "st1 { v18.s }[0], [x19], x17\n"
+ "st1 { v17.s }[0], [x19], x17\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "72:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "73:" // Tile loop: End
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x4, #0x1\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x26, x26, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x19\n"
+ "csel x26, x26, XZR, LT\n"
+ "csel x4, x4, x21, LT\n"
+ "cmp x4, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..51a5679bff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
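+ // The constructor above gathers the 36 pointers of the 6x6 input patch
+ // (row-major, so input_ptrs[14] is input point (2, 2)) into the order in
+ // which the assembly consumes them through the inptrs table.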
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "mov x14, #0x0\n"
+ "mov x13, #0x10\n" // cntb _, ALL, #1
+ "sub x12, XZR, x13\n"
+ "lsr x11, %x[n_channels], #0x2\n"
+ "cbz x11, 3f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x13, x11, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "ldr q9, [x10, x14]\n"
+ "ldr q10, [x9, x14]\n"
+ "ldr q11, [x28, x14]\n"
+ "ldr q12, [x27, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x24, x14]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x14]\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "ldr x21, [x17, #0x8]\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x9, x14]\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x28, x14]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "ldr q13, [x15, #0x0]\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v21.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v19.4s, v7.4s, v11.4s\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x14]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v16.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x23, x14]\n"
+ "fmla v27.4s, v7.4s, v11.4s\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v4.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x9, x14]\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x14]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v24.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "fmla v20.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v27.4s, v6.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v11.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v5.4s, v12.4s\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v19.4s, v8.4s, v10.4s\n"
+ "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x10, x14]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v6.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "ldr q9, [x10, x13]\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x28, x14]\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v19.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v20.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "ldr q10, [x9, x13]\n"
+ "add x13, x13, #0x10\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "ldr q4, [x15, #0x50]\n"
+ "cmp x13, x11, LSL #4\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "str q30, [x21, x12]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "str q29, [x20, x12]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "str q28, [x19, x12]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q27, [x22, x12]\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x12]\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q24, [x19, x12]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "ldr x20, [x17, #0x50]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q23, [x22, x12]\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "str q22, [x21, x12]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "str q21, [x20, x12]\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "ldr x19, [x17, #0x58]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "str q20, [x19, x12]\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "str q19, [x22, x12]\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "str q18, [x21, x12]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "ldr x19, [x17, #0x78]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x20, x12]\n"
+ "str q16, [x19, x12]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x24, x14]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x14]\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "ldr x21, [x17, #0x8]\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x9, x14]\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x28, x14]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v21.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v19.4s, v7.4s, v11.4s\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x14]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v16.4s, v7.4s, v11.4s\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x23, x14]\n"
+ "fmla v27.4s, v7.4s, v11.4s\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v4.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x9, x14]\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x28, x14]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v24.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "fmla v20.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v20.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v27.4s, v6.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v23.4s, v8.4s, v11.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v5.4s, v12.4s\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v19.4s, v8.4s, v10.4s\n"
+ "fmla v18.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x10, x14]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v6.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x28, x14]\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "fmla v19.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v20.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "str q30, [x21, x12]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "str q29, [x20, x12]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q28, [x19, x12]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q27, [x22, x12]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x12]\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "ldr x20, [x17, #0x50]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q24, [x19, x12]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "str q23, [x22, x12]\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "str q22, [x21, x12]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "ldr x19, [x17, #0x58]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q21, [x20, x12]\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "str q20, [x19, x12]\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "str q19, [x22, x12]\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "ldr x20, [x17, #0x70]\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "ldr x19, [x17, #0x78]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "str q18, [x21, x12]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x20, x12]\n"
+ "str q16, [x19, x12]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 72f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x10, [x16, #0x0]\n"
+ "add x10, x10, x14\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "add x9, x9, x14\n"
+ "ldr x27, [x16, #0x18]\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x10], #0x8\n"
+ "ld1 { v10.d }[0], [x9], #0x8\n"
+ "ld1 { v11.d }[0], [x28], #0x8\n"
+ "ld1 { v12.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x10], #0x4\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x27], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x26, x26, x14\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "7:" // Oddments: Load input (5, 0): Bit 1: End
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "9:" // Oddments: Load input (5, 5): Bit 1: End
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+ "ldr x24, [x16, #0x30]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "11:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "ldr x23, [x16, #0x38]\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "add x23, x23, x14\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x23], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "13:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "ldr x10, [x16, #0x40]\n"
+ "fmla v30.4s, v0.4s, v12.4s\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.s }[2], [x10], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (0, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "15:" // Oddments: Load input (0, 4): Bit 1: End
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr x9, [x16, #0x48]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v10.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "17:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "ldr x28, [x16, #0x50]\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x28, x28, x14\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v21.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v18.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.d }[0], [x28], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x28], #0x4\n"
+ "19:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ldr x27, [x16, #0x58]\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x27], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (1, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "21:" // Oddments: Load input (1, 5): Bit 1: End
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.s }[2], [x26], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "23:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v10.s }[2], [x25], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "25:" // Oddments: Load input (1, 2): Bit 1: End
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "add x24, x24, x14\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "27:" // Oddments: Load input (4, 5): Bit 1: End
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x23], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "29:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "add x10, x10, x14\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x10], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "31:" // Oddments: Load input (5, 1): Bit 1: End
+ "fmla v19.4s, v7.4s, v11.4s\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v10.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v10.s }[2], [x9], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "33:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "add x28, x28, x14\n"
+ "fmla v27.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v11.d }[0], [x28], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "35:" // Oddments: Load input (5, 4): Bit 1: End
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v16.4s, v7.4s, v11.4s\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v12.s }[2], [x27], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "37:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "add x26, x26, x14\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "39:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "add x25, x25, x14\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "41:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v27.4s, v7.4s, v11.4s\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "add x24, x24, x14\n"
+ "fmla v23.4s, v4.4s, v11.4s\n"
+ "fmla v22.4s, v3.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "43:" // Oddments: Load input (0, 3): Bit 1: End
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "add x23, x23, x14\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "45:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "add x10, x10, x14\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x10], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "47:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v24.4s, v7.4s, v11.4s\n"
+ "add x9, x9, x14\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "fmla v20.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x9], #0x4\n"
+ "49:" // Oddments: Load input (2, 5): Bit 1: End
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "add x28, x28, x14\n"
+ "fmla v20.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.d }[0], [x28], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.s }[2], [x28], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x28], #0x4\n"
+ "51:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v27.4s, v6.4s, v10.4s\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "add x27, x27, x14\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x27], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x27], #0x4\n"
+ "53:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v23.4s, v8.4s, v11.4s\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "add x26, x26, x14\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x26], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "55:" // Oddments: Load input (3, 5): Bit 1: End
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v20.4s, v5.4s, v12.4s\n"
+ "add x25, x25, x14\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v10.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v10.s }[2], [x25], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "57:" // Oddments: Load input (5, 2): Bit 1: End
+ "fmla v19.4s, v8.4s, v10.4s\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v18.4s, v7.4s, v10.4s\n"
+ "add x24, x24, x14\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "59:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "add x23, x23, x14\n"
+ "fmla v20.4s, v6.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v12.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 61f\n"
+ "ld1 { v12.s }[2], [x23], #0x4\n"
+ "b 61f\n"
+ "60:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "61:" // Oddments: Load input (5, 3): Bit 1: End
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "add x10, x10, x14\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.d }[0], [x10], #0x8\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.s }[2], [x10], #0x4\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x10], #0x4\n"
+ "63:" // Oddments: Load input (1, 1): Bit 1: End
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "add x9, x9, x14\n"
+ "fmla v27.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 65f\n"
+ "ld1 { v11.s }[2], [x9], #0x4\n"
+ "b 65f\n"
+ "64:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x9], #0x4\n"
+ "65:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v28.4s, v4.4s, v11.4s\n"
+ "add x28, x28, x14\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.d }[0], [x28], #0x8\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.s }[2], [x28], #0x4\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x28], #0x4\n"
+ "67:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "add x27, x27, x14\n"
+ "fmla v19.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v10.d }[0], [x27], #0x8\n"
+ "tbz %x[n_channels], #0, 69f\n"
+ "ld1 { v10.s }[2], [x27], #0x4\n"
+ "b 69f\n"
+ "68:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v10.s }[0], [x27], #0x4\n"
+ "69:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v20.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 70f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.d }[0], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.d }[0], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.d }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.d }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.d }[0], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.d }[0], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.d }[0], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.d }[0], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.d }[0], [x20]\n"
+ "add x12, x12, #0x8\n"
+ "st1 { v16.d }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[2], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[2], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.s }[2], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.s }[2], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.s }[2], [x20]\n"
+ "st1 { v16.s }[2], [x19]\n"
+ "b 71f\n"
+ "70:" // Oddments: Store: Bit 1: Unset
+ "ldr x22, [x17, #0x0]\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x10]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[0], [x22]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "71:" // Oddments: Store: Bit 1: End
+
+ "72:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8eb560562b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
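+  // A 2x2 output tile at stride 2 consumes a 5x5 input patch: (2 - 1) * 2 + 3 = 5.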
+
+ indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4466ec1974
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
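+    // Scratch fields: the assembly stores its tile counters here at the top of
+    // each tile-loop iteration and reloads them at the bottom.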
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x6, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x2\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x17, x17, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1r { v19.4s }, [x24]\n"
+ "add x14, x17, x23, LSL #2\n"
+ "ld1r { v18.4s }, [x21]\n"
+ "add x13, x14, x23, LSL #2\n"
+ "lsl x8, x8, #0x2\n"
+ "add x12, x13, x23, LSL #2\n"
+ "add x11, x12, x23, LSL #2\n"
+ "add x10, x8, x8\n"
+ "add x9, x10, x8\n"
+ "add x28, x9, x8\n"
+ "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x15, x15, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x27, x15, x20, LSL #2\n"
+ "lsl x16, x16, #0x2\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x2\n"
+ "cbz x19, 4f\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q0, [x7, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x7, #0x20]\n"
+ "ldr q2, [x7, #0x30]\n"
+ "ldr q3, [x7, #0x40]\n"
+ "ldr q4, [x7, #0x50]\n"
+ "ldr q5, [x7, #0x60]\n"
+ "ldr q6, [x7, #0x70]\n"
+ "ldr q7, [x7, #0x80]\n"
+ "ldr q8, [x7, #0x90]\n"
+ "add x7, x7, #0xa0\n"
+ "ldr q9, [x13, x10]\n"
+ "ld1 { v10.4s }, [x17]\n"
+ "ldr q11, [x17, x8]\n"
+ "ldr q12, [x17, x9]\n"
+ "ldr q13, [x17, x28]\n"
+ "ld1 { v14.4s }, [x14]\n"
+ "ldr q15, [x14, x8]\n"
+ "ldr q16, [x17, x10]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "add x17, x17, #0x10\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr q17, [x7, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x17]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v3.4s, v14.4s\n"
+ "ld1 { v14.4s }, [x12]\n"
+ "fmla v30.4s, v0.4s, v16.4s\n"
+ "fmla v31.4s, v4.4s, v15.4s\n"
+ "ld1 { v15.4s }, [x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ldr q14, [x12, x28]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x12, x8]\n"
+ "fmla v31.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x13, x8]\n"
+ "fmla v29.4s, v0.4s, v15.4s\n"
+ "ldr q0, [x7, #0x10]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x13, x9]\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x13, x28]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x12, x9]\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "ld1 { v15.4s }, [x11]\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x11, x8]\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr q4, [x7, #0x50]\n"
+ "fmla v31.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x12, x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x11, x10]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x17, x9]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "ldr q1, [x7, #0x20]\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "ldr q13, [x17, x28]\n"
+ "fmla v28.4s, v5.4s, v14.4s\n"
+ "ldr q14, [x11, x9]\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "st1 { v31.4s }, [x15]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "ldr q11, [x11, x28]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "ldr q2, [x7, #0x30]\n"
+ "ldr q5, [x7, #0x60]\n"
+ "fmla v28.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x17, x10]\n"
+ "fmla v29.4s, v8.4s, v15.4s\n"
+ "str q30, [x15, x16]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v28.4s, v7.4s, v14.4s\n"
+ "ld1 { v14.4s }, [x14]\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "ldr q3, [x7, #0x40]\n"
+ "ldr q7, [x7, #0x80]\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "st1 { v29.4s }, [x27]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x14, x8]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x17, x8]\n"
+ "ldr q6, [x7, #0x70]\n"
+ "fmax v28.4s, v28.4s, v19.4s\n"
+ "ldr q8, [x7, #0x90]\n"
+ "add x7, x7, #0xa0\n"
+ "fmin v28.4s, v28.4s, v18.4s\n"
+ "str q28, [x27, x16]\n"
+ "add x27, x27, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x17, x17, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.4s, v3.4s, v14.4s\n"
+ "ld1 { v14.4s }, [x12]\n"
+ "fmla v30.4s, v0.4s, v16.4s\n"
+ "fmla v31.4s, v4.4s, v15.4s\n"
+ "ld1 { v15.4s }, [x13]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x12, x8]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ldr q14, [x12, x28]\n"
+ "fmla v31.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x13, x8]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x13, x9]\n"
+ "fmla v29.4s, v0.4s, v15.4s\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x12, x9]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x13, x28]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "ld1 { v15.4s }, [x11]\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x11, x8]\n"
+ "fmla v31.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x12, x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x11, x10]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmla v28.4s, v5.4s, v14.4s\n"
+ "ldr q14, [x11, x9]\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "st1 { v31.4s }, [x15]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "ldr q11, [x11, x28]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "str q30, [x15, x16]\n"
+ "fmla v28.4s, v3.4s, v16.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v29.4s, v8.4s, v15.4s\n"
+ "fmla v28.4s, v7.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "st1 { v29.4s }, [x27]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v19.4s\n"
+ "fmin v28.4s, v28.4s, v18.4s\n"
+ "str q28, [x27, x16]\n"
+ "add x27, x27, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 43f\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q0, [x7, #0x10]\n"
+ "add x26, x13, x10\n"
+ "ldr q1, [x7, #0x20]\n"
+ "add x25, x17, XZR\n"
+ "ldr q2, [x7, #0x30]\n"
+ "add x24, x17, x8\n"
+ "ldr q3, [x7, #0x40]\n"
+ "add x23, x17, x9\n"
+ "ldr q4, [x7, #0x50]\n"
+ "add x22, x17, x28\n"
+ "ldr q5, [x7, #0x60]\n"
+ "add x21, x14, XZR\n"
+ "ldr q6, [x7, #0x70]\n"
+ "add x20, x14, x8\n"
+ "ldr q7, [x7, #0x80]\n"
+ "add x19, x17, x10\n"
+ "ldr q8, [x7, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d10, [x25], #0x8\n"
+ "ldr d11, [x24], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "ldr d13, [x22], #0x8\n"
+ "ldr d14, [x21], #0x8\n"
+ "ldr d15, [x20], #0x8\n"
+ "ldr d16, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v10.s }[2], [x25]\n"
+ "ld1 { v11.s }[2], [x24]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v13.s }[2], [x22]\n"
+ "ld1 { v14.s }[2], [x21]\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ldr s9, [x26, #0x0]\n"
+ "ldr s10, [x25, #0x0]\n"
+ "ldr s11, [x24, #0x0]\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s13, [x22, #0x0]\n"
+ "ldr s14, [x21, #0x0]\n"
+ "ldr s15, [x20, #0x0]\n"
+ "ldr s16, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "add x19, x14, x9\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v14.4s\n"
+ "fmla v30.4s, v0.4s, v16.4s\n"
+ "fmla v31.4s, v4.4s, v15.4s\n"
+ "fmla v31.4s, v2.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x19, x14, x28\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "add x19, x14, x10\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "add x19, x12, XZR\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d14, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.s }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s14, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "add x19, x13, XZR\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d15, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v15.s }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s15, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x19, x12, x8\n"
+ "fmla v29.4s, v0.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x19, x13, x8\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d16, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s16, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.4s, v7.4s, v16.4s\n"
+ "add x19, x12, x9\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "add x19, x13, x9\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x19, x12, x28\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d14, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v14.s }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s14, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.4s, v5.4s, v14.4s\n"
+ "add x19, x11, XZR\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d15, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.s }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s15, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v29.4s, v6.4s, v15.4s\n"
+ "add x19, x13, x28\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "add x19, x11, x8\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "add x19, x12, x10\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d16, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s16, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x19, x11, x9\n"
+ "fmla v28.4s, v3.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d14, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v14.s }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s14, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v28.4s, v7.4s, v14.4s\n"
+ "add x19, x11, x10\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d15, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v15.s }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s15, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v15.4s\n"
+ "add x19, x11, x28\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v19.4s\n"
+ "fmin v28.4s, v28.4s, v18.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "mov x19, x15\n"
+ "st1 { v31.d }[0], [x19], x16\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v30.d }[0], [x19]\n"
+ "mov x19, x27\n"
+ "st1 { v29.d }[0], [x19], x16\n"
+ "add x27, x27, #0x8\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "mov x20, x15\n"
+ "st1 { v31.s }[2], [x20], x16\n"
+ "mov x19, x27\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19], x16\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x15\n"
+ "st1 { v31.s }[0], [x20], x16\n"
+ "mov x19, x27\n"
+ "st1 { v30.s }[0], [x20]\n"
+ "st1 { v29.s }[0], [x19], x16\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "42:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "43:" // Tile loop: End
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x6, #0x1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x19\n"
+ "csel x27, x27, XZR, LT\n"
+ "csel x6, x6, x21, LT\n"
+ "cmp x6, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..a5153019e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
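+      // Remap the 25 pointers of the 5x5 input patch into the fixed order
+      // in which the assembly below consumes them.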
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v18.4s }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x2\n"
+ "cbz x27, 3f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr q12, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr q14, [x21, x14]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ldr q15, [x20, x14]\n"
+ "ldr q16, [x19, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x24, x14]\n"
+ "fmla v31.4s, v3.4s, v14.4s\n"
+ "ldr q14, [x23, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v30.4s, v0.4s, v16.4s\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.4s, v4.4s, v15.4s\n"
+ "ldr q15, [x22, x14]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v29.4s, v0.4s, v15.4s\n"
+ "ldr q14, [x25, x14]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x26, x14]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x19, x14]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x24, x14]\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v31.4s, v7.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v15.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "ldr q15, [x19, x14]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v28.4s, v5.4s, v14.4s\n"
+ "ldr q14, [x20, x14]\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "str q31, [x13, x28]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "fmla v28.4s, v3.4s, v16.4s\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "fmla v29.4s, v8.4s, v15.4s\n"
+ "ldr q9, [x26, x11]\n"
+ "ldr q10, [x25, x11]\n"
+ "fmla v28.4s, v7.4s, v14.4s\n"
+ "ldr q12, [x23, x11]\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "ldr q13, [x22, x11]\n"
+ "ldr q14, [x21, x11]\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "str q30, [x12, x28]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x24, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "fmax v28.4s, v28.4s, v19.4s\n"
+ "ldr q16, [x19, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v28.4s, v28.4s, v18.4s\n"
+ "str q29, [x10, x28]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "str q28, [x9, x28]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x24, x14]\n"
+ "fmla v31.4s, v3.4s, v14.4s\n"
+ "ldr q14, [x23, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v30.4s, v0.4s, v16.4s\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.4s, v4.4s, v15.4s\n"
+ "ldr q15, [x22, x14]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v29.4s, v0.4s, v15.4s\n"
+ "ldr q14, [x25, x14]\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x26, x14]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "ldr q13, [x19, x14]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "ldr q15, [x24, x14]\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v31.4s, v7.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v15.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "ldr q15, [x19, x14]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmla v28.4s, v5.4s, v14.4s\n"
+ "ldr q14, [x20, x14]\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "str q31, [x13, x28]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "ldr q11, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "str q30, [x12, x28]\n"
+ "fmla v28.4s, v3.4s, v16.4s\n"
+ "fmla v29.4s, v8.4s, v15.4s\n"
+ "fmla v28.4s, v7.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "str q29, [x10, x28]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v19.4s\n"
+ "fmin v28.4s, v28.4s, v18.4s\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 42f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x26, [x16, #0x0]\n"
+ "ldr x25, [x16, #0x8]\n"
+ "ldr x24, [x16, #0x10]\n"
+ "add x26, x26, x14\n"
+ "ldr x23, [x16, #0x18]\n"
+ "add x25, x25, x14\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x23, x23, x14\n"
+ "ldr x20, [x16, #0x30]\n"
+ "add x22, x22, x14\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x26], #0x8\n"
+ "ld1 { v10.d }[0], [x25], #0x8\n"
+ "ld1 { v11.d }[0], [x24], #0x8\n"
+ "ld1 { v12.d }[0], [x23], #0x8\n"
+ "ld1 { v13.d }[0], [x22], #0x8\n"
+ "ld1 { v14.d }[0], [x21], #0x8\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x25], #0x4\n"
+ "ld1 { v11.s }[2], [x24], #0x4\n"
+ "ld1 { v12.s }[2], [x23], #0x4\n"
+ "ld1 { v13.s }[2], [x22], #0x4\n"
+ "ld1 { v14.s }[2], [x21], #0x4\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "ld1 { v16.s }[2], [x19], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v14.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x19], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x26, x26, x14\n"
+ "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v14.4s\n"
+ "fmla v30.4s, v0.4s, v16.4s\n"
+ "fmla v31.4s, v4.4s, v15.4s\n"
+ "fmla v31.4s, v2.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v11.s }[2], [x26], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "ldr x25, [x16, #0x48]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x25], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "11:" // Oddments: Load input (1, 2): Bit 1: End
+ "fmla v31.4s, v5.4s, v13.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v14.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.s }[2], [x23], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v14.s }[0], [x23], #0x4\n"
+ "13:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ldr x22, [x16, #0x60]\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v15.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v15.s }[2], [x22], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x22], #0x4\n"
+ "15:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.4s, v0.4s, v15.4s\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "17:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "ldr x20, [x16, #0x70]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.4s, v7.4s, v16.4s\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v13.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v13.s }[2], [x19], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x19], #0x4\n"
+ "21:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "ldr x26, [x16, #0x80]\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.s }[2], [x26], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v14.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v14.s }[0], [x25], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v28.4s, v5.4s, v14.4s\n"
+ "ldr x24, [x16, #0x90]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x24], #0x4\n"
+ "27:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v29.4s, v6.4s, v15.4s\n"
+ "ldr x23, [x16, #0x98]\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "29:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v13.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v13.s }[2], [x22], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "31:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v16.s }[2], [x21], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v16.s }[0], [x21], #0x4\n"
+ "33:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v28.4s, v3.4s, v16.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v28.4s, v7.4s, v14.4s\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v15.s }[2], [x19], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v15.s }[0], [x19], #0x4\n"
+ "37:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v15.4s\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v11.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v11.s }[2], [x26], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "39:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v19.4s\n"
+ "fmax v30.4s, v30.4s, v19.4s\n"
+ "fmax v29.4s, v29.4s, v19.4s\n"
+ "fmin v31.4s, v31.4s, v18.4s\n"
+ "fmin v30.4s, v30.4s, v18.4s\n"
+ "fmin v29.4s, v29.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v19.4s\n"
+ "fmin v28.4s, v28.4s, v18.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "st1 { v31.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x10], #0x8\n"
+ "st1 { v28.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "st1 { v31.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x10], #0x4\n"
+ "st1 { v28.s }[2], [x9], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "41:" // Oddments: Store: Bit 1: End
+
+ "42:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
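A note on the "Oddments" tail above: the leftover n_channels % 4 lanes are handled by testing the two low bits of n_channels, with "tbz #1" guarding a 64-bit (two-float) load or store and "tbz #0" a final single-float lane. A minimal sketch of the store side in plain C++, using a hypothetical store_oddments helper that is not part of this patch:

    // Sketch only: mirrors the tbz-driven oddment stores in the assembly above.
    // store_oddments is a hypothetical helper, not part of this patch.
    static void store_oddments(float *dst, const float v[4], unsigned int n_channels)
    {
      unsigned int i = 0;
      if (n_channels & 2) { dst[0] = v[0]; dst[1] = v[1]; i = 2; }  // "st1 { v.d }[0]" path (Bit 1)
      if (n_channels & 1) { dst[i] = v[i]; }                        // "st1 { v.s }[2]" or "[0]" path (Bit 0)
    }

The load side is symmetric, which is why every oddment block in these kernels is the same three-way branch on the same two bits.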
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..314fe766de
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
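The tile geometry declared above is self-consistent: with stride 1, producing a 2-point output tile per axis under a 5-point window requires kernel + (output - 1) * stride = 6 input points per axis, which is where input_rows and input_cols = 6 come from. An illustrative compile-time check (not part of the patch):

    // Illustrative only: input tile size implied by the descriptor above.
    constexpr unsigned int input_tile_dim(unsigned int kernel, unsigned int stride, unsigned int output)
    {
      return kernel + (output - 1) * stride;
    }
    static_assert(input_tile_dim(5, 1, 2) == 6, "5x5 s1 kernel with a 2x2 output tile reads a 6x6 input tile");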
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..170eb2267b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,969 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x2\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x28, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x5, x5, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1r { v18.4s }, [x24]\n"
+ "add x8, x5, x23, LSL #2\n"
+ "ld1r { v17.4s }, [x21]\n"
+ "add x17, x8, x23, LSL #2\n"
+ "lsl x4, x4, #0x2\n"
+ "add x16, x17, x23, LSL #2\n"
+ "add x15, x16, x23, LSL #2\n"
+ "add x14, x15, x23, LSL #2\n"
+ "add x13, x4, x4\n"
+ "add x12, x13, x4\n"
+ "add x11, x12, x4\n"
+ "add x10, x11, x4\n"
+ "mul x19, x28, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x7, x7, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x9, x7, x20, LSL #2\n"
+ "lsl x6, x6, #0x2\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x2\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x3, #0x0]\n"
+ "ldr q0, [x3, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x3, #0x20]\n"
+ "ldr q2, [x3, #0x30]\n"
+ "ldr q3, [x3, #0x40]\n"
+ "ldr q4, [x3, #0x50]\n"
+ "add x3, x3, #0x60\n"
+ "ld1 { v5.4s }, [x5]\n"
+ "ldr q6, [x5, x4]\n"
+ "ld1 { v7.4s }, [x8]\n"
+ "ldr q8, [x8, x4]\n"
+ "ldr q9, [x5, x13]\n"
+ "ldr q13, [x8, x13]\n"
+ "ldr q11, [x5, x12]\n"
+ "ldr q12, [x5, x11]\n"
+ "ldr q10, [x8, x10]\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x8, x12]\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q0, [x3, #0x0]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x8, x11]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ldr q16, [x3, #0x140]\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x3, #0x10]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x5, x10]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "ldr q2, [x3, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x17, x4]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr q3, [x3, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x17, x13]\n"
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "ldr q9, [x17, x12]\n"
+ "fmla v29.4s, v4.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x3, #0x40]\n"
+ "fmla v31.4s, v0.4s, v7.4s\n"
+ "ld1 { v7.4s }, [x8]\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "fmla v29.4s, v0.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x3, #0x50]\n"
+ "fmla v31.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x17, x10]\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x3, #0x60]\n"
+ "fmla v31.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x17, x11]\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q2, [x3, #0x70]\n"
+ "fmla v31.4s, v3.4s, v5.4s\n"
+ "ld1 { v5.4s }, [x16]\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q3, [x3, #0x80]\n"
+ "fmla v31.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x16, x4]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "ldr q10, [x16, x13]\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr q4, [x3, #0x90]\n"
+ "fmla v31.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x16, x10]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v5.4s\n"
+ "fmla v28.4s, v0.4s, v6.4s\n"
+ "ldr q0, [x3, #0xa0]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x12]\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v10.4s\n"
+ "ldr q1, [x3, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x16, x11]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x3, #0xc0]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x3, #0xd0]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x15, x4]\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "ldr q8, [x15, x11]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v14.4s\n"
+ "ldr q4, [x3, #0xe0]\n"
+ "fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x15, x13]\n"
+ "fmla v30.4s, v0.4s, v6.4s\n"
+ "fmla v29.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q0, [x3, #0xf0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x15, x12]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v5.4s\n"
+ "ldr q1, [x3, #0x100]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x15, x10]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v6.4s\n"
+ "ldr q2, [x3, #0x110]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x14]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v8.4s\n"
+ "ldr q3, [x3, #0x120]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x14, x4]\n"
+ "fmla v30.4s, v4.4s, v14.4s\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x3, #0x130]\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x14, x13]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr q0, [x3, #0x150]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "ldr q13, [x8, x13]\n"
+ "fmla v30.4s, v1.4s, v5.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr q1, [x3, #0x160]\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "ld1 { v5.4s }, [x5]\n"
+ "fmla v30.4s, v2.4s, v6.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x3, #0x170]\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "ldr q6, [x5, x4]\n"
+ "fmla v30.4s, v3.4s, v8.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x5, x12]\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x3, #0x180]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "ldr q8, [x8, x4]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x5, x11]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "ldr q9, [x5, x13]\n"
+ "ldr q4, [x3, #0x190]\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "add x3, x3, #0x1a0\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "st1 { v31.4s }, [x7]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q30, [x7, x6]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "st1 { v29.4s }, [x9]\n"
+ "str q28, [x9, x6]\n"
+ "add x9, x9, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x8, x12]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q0, [x3, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x8, x11]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x3, #0x10]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x5, x10]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "ldr q2, [x3, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x17, x4]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr q3, [x3, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x17, x13]\n"
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "ldr q9, [x17, x12]\n"
+ "fmla v29.4s, v4.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x3, #0x40]\n"
+ "fmla v31.4s, v0.4s, v7.4s\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "fmla v29.4s, v0.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x3, #0x50]\n"
+ "fmla v31.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x17, x10]\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x3, #0x60]\n"
+ "fmla v31.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x17, x11]\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q2, [x3, #0x70]\n"
+ "fmla v31.4s, v3.4s, v5.4s\n"
+ "ld1 { v5.4s }, [x16]\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q3, [x3, #0x80]\n"
+ "fmla v31.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x16, x4]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "ldr q10, [x16, x13]\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr q4, [x3, #0x90]\n"
+ "fmla v31.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x16, x10]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v5.4s\n"
+ "fmla v28.4s, v0.4s, v6.4s\n"
+ "ldr q0, [x3, #0xa0]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x16, x12]\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v10.4s\n"
+ "ldr q1, [x3, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x16, x11]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x3, #0xc0]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x3, #0xd0]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x15, x4]\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "ldr q8, [x15, x11]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v14.4s\n"
+ "ldr q4, [x3, #0xe0]\n"
+ "fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x15, x13]\n"
+ "fmla v30.4s, v0.4s, v6.4s\n"
+ "fmla v29.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q0, [x3, #0xf0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x15, x12]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v5.4s\n"
+ "ldr q1, [x3, #0x100]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x15, x10]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v6.4s\n"
+ "ldr q2, [x3, #0x110]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ld1 { v11.4s }, [x14]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v8.4s\n"
+ "ldr q3, [x3, #0x120]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x14, x4]\n"
+ "fmla v30.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x3, #0x130]\n"
+ "add x3, x3, #0x140\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x14, x13]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v5.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v6.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v8.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "st1 { v31.4s }, [x7]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q30, [x7, x6]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "st1 { v29.4s }, [x9]\n"
+ "str q28, [x9, x6]\n"
+ "add x9, x9, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 61f\n"
+ "ldr q16, [x3, #0x0]\n"
+ "ldr q0, [x3, #0x10]\n"
+ "add x28, x5, XZR\n"
+ "ldr q1, [x3, #0x20]\n"
+ "add x27, x5, x4\n"
+ "ldr q2, [x3, #0x30]\n"
+ "add x26, x8, XZR\n"
+ "ldr q3, [x3, #0x40]\n"
+ "add x25, x8, x4\n"
+ "ldr q4, [x3, #0x50]\n"
+ "add x24, x5, x13\n"
+ "add x23, x8, x13\n"
+ "add x22, x5, x12\n"
+ "add x21, x5, x11\n"
+ "add x20, x8, x10\n"
+ "add x19, x17, XZR\n"
+ "add x3, x3, #0x60\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d5, [x28], #0x8\n"
+ "ldr d6, [x27], #0x8\n"
+ "ldr d7, [x26], #0x8\n"
+ "ldr d8, [x25], #0x8\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d10, [x20], #0x8\n"
+ "ldr d14, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v5.s }[2], [x28]\n"
+ "ld1 { v6.s }[2], [x27]\n"
+ "ld1 { v7.s }[2], [x26]\n"
+ "ld1 { v8.s }[2], [x25]\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "ld1 { v14.s }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ldr s5, [x28, #0x0]\n"
+ "ldr s6, [x27, #0x0]\n"
+ "ldr s7, [x26, #0x0]\n"
+ "ldr s8, [x25, #0x0]\n"
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s10, [x20, #0x0]\n"
+ "ldr s14, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "add x19, x8, x12\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d5, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.s }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s5, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "add x19, x8, x11\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d6, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v6.s }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s6, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "add x19, x5, x10\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "ldr s0, [x3, #0x18]\n"
+ "add x19, x17, x4\n"
+ "fmla v29.4s, v4.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v7.4s\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "fmla v29.4s, v0.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr s1, [x3, #0x1c]\n"
+ "add x19, x17, x13\n"
+ "fmla v31.4s, v1.4s, v8.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr s2, [x3, #0x20]\n"
+ "add x19, x17, x12\n"
+ "fmla v31.4s, v2.4s, v13.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr s3, [x3, #0x24]\n"
+ "add x19, x17, x11\n"
+ "fmla v31.4s, v3.4s, v5.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr s4, [x3, #0x28]\n"
+ "add x19, x17, x10\n"
+ "fmla v31.4s, v4.4s, v6.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d8, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v8.s }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s8, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr s0, [x3, #0x2c]\n"
+ "add x19, x16, XZR\n"
+ "fmla v31.4s, v0.4s, v14.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d5, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v5.s }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s5, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.4s, v0.4s, v5.4s\n"
+ "add x19, x16, x4\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d6, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v6.s }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s6, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v6.4s\n"
+ "ldr s1, [x3, #0x30]\n"
+ "add x19, x16, x13\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v10.4s\n"
+ "ldr s2, [x3, #0x34]\n"
+ "add x19, x16, x12\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr s3, [x3, #0x38]\n"
+ "add x19, x16, x11\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr s4, [x3, #0x3c]\n"
+ "add x19, x16, x10\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d14, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v14.s }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s14, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v14.4s\n"
+ "ldr s0, [x3, #0x40]\n"
+ "add x19, x15, XZR\n"
+ "fmla v31.4s, v0.4s, v5.4s\n"
+ "fmla v30.4s, v0.4s, v6.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v29.4s, v0.4s, v9.4s\n"
+ "add x19, x15, x4\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d13, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr s1, [x3, #0x44]\n"
+ "add x19, x15, x13\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d5, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v5.s }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s5, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v5.4s\n"
+ "ldr s2, [x3, #0x48]\n"
+ "add x19, x15, x12\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d6, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v6.s }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s6, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v6.4s\n"
+ "ldr s3, [x3, #0x4c]\n"
+ "add x19, x15, x11\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d8, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v8.s }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s8, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v8.4s\n"
+ "ldr s4, [x3, #0x50]\n"
+ "add x19, x15, x10\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s10, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr s0, [x3, #0x54]\n"
+ "add x19, x14, XZR\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x19, x14, x4\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr s1, [x3, #0x58]\n"
+ "add x19, x14, x13\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v5.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr s2, [x3, #0x5c]\n"
+ "add x19, x14, x12\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v6.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s11, [x19, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr s3, [x3, #0x60]\n"
+ "add x19, x14, x11\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v8.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s12, [x19, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr s4, [x3, #0x64]\n"
+ "add x19, x14, x10\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d9, [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v9.s }[2], [x19]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s9, [x19, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "mov x19, x7\n"
+ "st1 { v31.d }[0], [x19], x6\n"
+ "add x7, x7, #0x8\n"
+ "st1 { v30.d }[0], [x19]\n"
+ "mov x19, x9\n"
+ "st1 { v29.d }[0], [x19], x6\n"
+ "add x9, x9, #0x8\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "mov x20, x7\n"
+ "st1 { v31.s }[2], [x20], x6\n"
+ "mov x19, x9\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19], x6\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x7\n"
+ "st1 { v31.s }[0], [x20], x6\n"
+ "mov x19, x9\n"
+ "st1 { v30.s }[0], [x20]\n"
+ "st1 { v29.s }[0], [x19], x6\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "60:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "61:" // Tile loop: End
+ "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x28, #0x1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x19\n"
+ "csel x27, x27, XZR, LT\n"
+ "csel x28, x28, x21, LT\n"
+ "cmp x28, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
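The inline comments in the tile loop above spell out the per-tile pointer arithmetic: offset = tile_i * ld_input_row + tile_j * ld_input_col, scaled by two (the stride of 1 times the 2x2 output tile) and then shifted left by two to convert float elements to bytes. The same computation as a plain C++ sketch, assuming as in the Args struct that the strides are given in float elements:

    #include <cstdint>

    // Sketch of the per-tile input-pointer computation done in x19/x5 above.
    static const float *tile_input_ptr(const float *inptr, uint64_t tile_i, uint64_t tile_j,
                                       uint64_t ld_input_row, uint64_t ld_input_col)
    {
      uint64_t offset = (tile_i * ld_input_row + tile_j * ld_input_col) * 2;
      return inptr + offset;  // the asm adds (offset << 2) bytes, i.e. offset floats
    }

The output pointer is derived the same way from ld_output_row and ld_output_col, which is why the tile loop only carries tile_i and tile_j between iterations.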
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..de66a8c485
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
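+      // Note: the reordering below matches the order in which the assembly
+      // primes its registers; the first ten entries are the 6x6-tile points
+      // (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4),
+      // (1, 5), (2, 0) loaded before the channel loop (see the
+      // "Oddments: Load inputs" labels further down).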
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x2\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x15, x15, #0x60\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldr q5, [x26, x14]\n"
+ "ldr q6, [x25, x14]\n"
+ "ldr q7, [x24, x14]\n"
+ "ldr q8, [x23, x14]\n"
+ "ldr q9, [x22, x14]\n"
+ "ldr q13, [x21, x14]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ldp x26, x25, [x16, #0x40]\n"
+ "ldr q11, [x20, x14]\n"
+ "ldr q12, [x19, x14]\n"
+ "ldr q10, [x26, x14]\n"
+ "ldr q14, [x25, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "ldr x22, [x16, #0x60]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x23, x14]\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x21, x14]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v5.4s\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr q3, [x15, #0x30]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x20, x14]\n"
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v6.4s\n"
+ "ldr q9, [x19, x14]\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x15, #0x40]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.4s, v0.4s, v7.4s\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "fmla v29.4s, v0.4s, v14.4s\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x15, #0x50]\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v31.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x25, x14]\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x15, #0x60]\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "fmla v31.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x26, x14]\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q2, [x15, #0x70]\n"
+ "ldr q16, [x15, #0x140]\n"
+ "fmla v31.4s, v3.4s, v5.4s\n"
+ "ldr q5, [x24, x14]\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q3, [x15, #0x80]\n"
+ "fmla v31.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "ldr q10, [x22, x14]\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr q4, [x15, #0x90]\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "fmla v31.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x19, x14]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v5.4s\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "fmla v28.4s, v0.4s, v6.4s\n"
+ "ldr q0, [x15, #0xa0]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v10.4s\n"
+ "ldr q1, [x15, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x15, #0xc0]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x26, [x16, #0x100]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x15, #0xd0]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x25, x14]\n"
+ "ldr x25, [x16, #0x108]\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q8, [x22, x14]\n"
+ "fmla v28.4s, v4.4s, v14.4s\n"
+ "ldr q4, [x15, #0xe0]\n"
+ "fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0x110]\n"
+ "fmla v30.4s, v0.4s, v6.4s\n"
+ "fmla v29.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q0, [x15, #0xf0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0x118]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v5.4s\n"
+ "ldr q1, [x15, #0x100]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v6.4s\n"
+ "ldr q2, [x15, #0x110]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v8.4s\n"
+ "ldr q3, [x15, #0x120]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v30.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x15, #0x130]\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x26, x14]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldr q0, [x15, #0x150]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v5.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr q1, [x15, #0x160]\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "ldr q5, [x26, x11]\n"
+ "fmla v30.4s, v2.4s, v6.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x23, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "ldr q6, [x25, x11]\n"
+ "fmla v30.4s, v3.4s, v8.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "ldr q7, [x24, x11]\n"
+ "ldr q13, [x21, x11]\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "ldr q8, [x23, x11]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q11, [x20, x11]\n"
+ "ldr q12, [x19, x11]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "ldr q9, [x22, x11]\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "ldp x26, x25, [x16, #0x40]\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "ldr q2, [x15, #0x170]\n"
+ "ldr q3, [x15, #0x180]\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "ldr q10, [x26, x11]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "ldr q14, [x25, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q31, [x13, x28]\n"
+ "cmp x11, x27, LSL #4\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q30, [x12, x28]\n"
+ "ldr q4, [x15, #0x190]\n"
+ "add x15, x15, #0x1a0\n"
+ "str q29, [x10, x28]\n"
+ "str q28, [x9, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "ldr x22, [x16, #0x60]\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x23, x14]\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x21, x14]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v5.4s\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr q3, [x15, #0x30]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x20, x14]\n"
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v6.4s\n"
+ "ldr q9, [x19, x14]\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x15, #0x40]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.4s, v0.4s, v7.4s\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "fmla v29.4s, v0.4s, v14.4s\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr q0, [x15, #0x50]\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v31.4s, v1.4s, v8.4s\n"
+ "ldr q8, [x25, x14]\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x15, #0x60]\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "fmla v31.4s, v2.4s, v13.4s\n"
+ "ldr q13, [x26, x14]\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr q2, [x15, #0x70]\n"
+ "fmla v31.4s, v3.4s, v5.4s\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q3, [x15, #0x80]\n"
+ "fmla v31.4s, v4.4s, v6.4s\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "ldr q10, [x22, x14]\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr q4, [x15, #0x90]\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "fmla v31.4s, v0.4s, v14.4s\n"
+ "ldr q14, [x19, x14]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v5.4s\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "fmla v28.4s, v0.4s, v6.4s\n"
+ "ldr q0, [x15, #0xa0]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v10.4s\n"
+ "ldr q1, [x15, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x15, #0xc0]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x26, [x16, #0x100]\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr q3, [x15, #0xd0]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr q13, [x25, x14]\n"
+ "ldr x25, [x16, #0x108]\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "ldr q8, [x22, x14]\n"
+ "fmla v28.4s, v4.4s, v14.4s\n"
+ "ldr q4, [x15, #0xe0]\n"
+ "fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0x110]\n"
+ "fmla v30.4s, v0.4s, v6.4s\n"
+ "fmla v29.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr q0, [x15, #0xf0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0x118]\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v5.4s\n"
+ "ldr q1, [x15, #0x100]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v6.4s\n"
+ "ldr q2, [x15, #0x110]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v8.4s\n"
+ "ldr q3, [x15, #0x120]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v30.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q4, [x15, #0x130]\n"
+ "add x15, x15, #0x140\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x26, x14]\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v5.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v6.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x23, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v8.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "str q31, [x13, x28]\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "str q30, [x12, x28]\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 60f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr x24, [x16, #0x10]\n"
+ "ldr x23, [x16, #0x18]\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x23, x23, x14\n"
+ "ldr x20, [x16, #0x30]\n"
+ "add x22, x22, x14\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x21, x21, x14\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x20, x20, x14\n"
+ "ldr x25, [x16, #0x48]\n"
+ "add x19, x19, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x15, x15, #0x60\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.d }[0], [x26], #0x8\n"
+ "ld1 { v6.d }[0], [x25], #0x8\n"
+ "ld1 { v7.d }[0], [x24], #0x8\n"
+ "ld1 { v8.d }[0], [x23], #0x8\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v13.d }[0], [x21], #0x8\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "ld1 { v12.d }[0], [x19], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v14.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v7.s }[2], [x24], #0x4\n"
+ "ld1 { v8.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x26], #0x4\n"
+ "ld1 { v6.s }[2], [x25], #0x4\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v13.s }[2], [x21], #0x4\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "ld1 { v12.s }[2], [x19], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x26], #0x4\n"
+ "ld1 { v6.s }[0], [x25], #0x4\n"
+ "ld1 { v7.s }[0], [x24], #0x4\n"
+ "ld1 { v8.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v14.s }[0], [x25], #0x4\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+ "add x24, x24, x14\n"
+ "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.s }[2], [x24], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "add x23, x23, x14\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v6.s }[2], [x23], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "11:" // Oddments: Load input (0, 5): Bit 1: End
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "ldr s0, [x15, #0x18]\n"
+ "fmla v29.4s, v4.4s, v6.4s\n"
+ "ldr x21, [x16, #0x68]\n"
+ "add x21, x21, x14\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v7.4s\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "fmla v29.4s, v0.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "13:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v11.4s\n"
+ "ldr s1, [x15, #0x1c]\n"
+ "fmla v31.4s, v1.4s, v8.4s\n"
+ "ldr x20, [x16, #0x70]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr s2, [x15, #0x20]\n"
+ "fmla v31.4s, v2.4s, v13.4s\n"
+ "ldr x19, [x16, #0x78]\n"
+ "add x19, x19, x14\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x19], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x19], #0x4\n"
+ "17:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "ldr s3, [x15, #0x24]\n"
+ "fmla v31.4s, v3.4s, v5.4s\n"
+ "ldr x26, [x16, #0x80]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x26], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x26], #0x4\n"
+ "19:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr s4, [x15, #0x28]\n"
+ "fmla v31.4s, v4.4s, v6.4s\n"
+ "ldr x25, [x16, #0x88]\n"
+ "add x25, x25, x14\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v8.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v8.s }[2], [x25], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v8.s }[0], [x25], #0x4\n"
+ "21:" // Oddments: Load input (2, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "ldr s0, [x15, #0x2c]\n"
+ "fmla v31.4s, v0.4s, v14.4s\n"
+ "ldr x24, [x16, #0x90]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v5.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v5.s }[2], [x24], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "23:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.4s, v0.4s, v5.4s\n"
+ "ldr x23, [x16, #0x98]\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v6.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v6.s }[2], [x23], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v6.4s\n"
+ "ldr s1, [x15, #0x30]\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "add x22, x22, x14\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v10.4s\n"
+ "ldr s2, [x15, #0x34]\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "add x21, x21, x14\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "29:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr s3, [x15, #0x38]\n"
+ "fmla v31.4s, v3.4s, v9.4s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v3.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr s4, [x15, #0x3c]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "add x19, x19, x14\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v14.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v14.s }[2], [x19], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v14.s }[0], [x19], #0x4\n"
+ "33:" // Oddments: Load input (3, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v14.4s\n"
+ "ldr s0, [x15, #0x40]\n"
+ "fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.4s, v0.4s, v6.4s\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "35:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v29.4s, v0.4s, v9.4s\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x25], #0x4\n"
+ "37:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "ldr s1, [x15, #0x44]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v5.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v5.s }[2], [x24], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "39:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v5.4s\n"
+ "ldr s2, [x15, #0x48]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "add x23, x23, x14\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v6.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v6.s }[2], [x23], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "41:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v6.4s\n"
+ "ldr s3, [x15, #0x4c]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "add x22, x22, x14\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v8.d }[0], [x22], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v8.s }[2], [x22], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "43:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v8.4s\n"
+ "ldr s4, [x15, #0x50]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "add x21, x21, x14\n"
+ "fmla v30.4s, v4.4s, v14.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "45:" // Oddments: Load input (4, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr s0, [x15, #0x54]\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (5, 0): Bit 1: End
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x19], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "49:" // Oddments: Load input (5, 1): Bit 1: End
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "ldr s1, [x15, #0x58]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "ldr x26, [x16, #0x100]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.4s, v1.4s, v5.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v9.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "51:" // Oddments: Load input (5, 2): Bit 1: End
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr s2, [x15, #0x5c]\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "ldr x25, [x16, #0x108]\n"
+ "add x25, x25, x14\n"
+ "fmla v30.4s, v2.4s, v6.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "53:" // Oddments: Load input (5, 3): Bit 1: End
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr s3, [x15, #0x60]\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "ldr x24, [x16, #0x110]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.4s, v3.4s, v8.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "55:" // Oddments: Load input (5, 4): Bit 1: End
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "ldr s4, [x15, #0x64]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "ldr x23, [x16, #0x118]\n"
+ "add x23, x23, x14\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "57:" // Oddments: Load input (5, 5): Bit 1: End
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmax v31.4s, v31.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v18.4s\n"
+ "fmin v31.4s, v31.4s, v17.4s\n"
+ "fmin v30.4s, v30.4s, v17.4s\n"
+ "fmin v29.4s, v29.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v18.4s\n"
+ "fmin v28.4s, v28.4s, v17.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "st1 { v31.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x10], #0x8\n"
+ "st1 { v28.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "st1 { v31.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x10], #0x4\n"
+ "st1 { v28.s }[2], [x9], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "59:" // Oddments: Store: Bit 1: End
+
+ "60:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
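
[Editor's note, not part of the patch] The "Oddments" paths in the kernel above (labels 3: through 59:) handle the 1-3 channels left over after the 4-wide vector loop, using `tbz` tests on bits 1 and 0 of `n_channels` to load and store two lanes, then one. A minimal C++ sketch of that tail logic, for orientation only (the helper name is hypothetical):

    #include <cstddef>

    // Mirrors the tbz/ld1/st1 pattern: handle the n_channels % 4 trailing
    // floats, first as a two-element chunk (bit 1), then a single lane (bit 0).
    static void copy_tail(const float *src, float *dst, std::size_t n_channels)
    {
      std::size_t i = 0;
      if (n_channels & 2)  // "tbz %x[n_channels], #1, ..." branches when clear
      {
        dst[i] = src[i];
        dst[i + 1] = src[i + 1];  // ld1 { v.d }[0] / st1 { v.d }[0]
        i += 2;
      }
      if (n_channels & 1)  // "tbz %x[n_channels], #0, ..." branches when clear
      {
        dst[i] = src[i];          // ld1 { v.s }[lane] / st1 { v.s }[lane]
      }
    }
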
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0f6cecdc56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_generic_output9_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+ a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
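
[Editor's note, not part of the patch] This header only declares a strategy descriptor: the typedefs and `constexpr` members describe the kernel to the depthwise framework, and `kernel` points at the assembly implementation. A hedged usage sketch follows; the wrapper function is hypothetical, and the real dispatch lives elsewhere in arm_conv::depthwise:

    // Invokes a generic depthfirst strategy; assumes the eight-argument
    // kern_type signature declared above and a CPUInfo type in scope.
    template <typename Strategy>
    void invoke_generic(const float *const *inptrs, float *const *outptrs,
                        const void *params, const void *bias,
                        unsigned int n_points, unsigned int n_channels,
                        float act_min, float act_max)
    {
      static_assert(Strategy::n_output_points == 9, "expects 9 output points");
      Strategy strat(nullptr);  // the CPUInfo argument is unused here
      strat.kernel(inptrs, outptrs, params, bias, n_points, n_channels,
                   act_min, act_max);
    }
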
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e8e817e9cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v4.4s }, [%x[minmax_vals]]\n"
+ "add x19, %x[minmax_vals], #0x4\n"
+ "mov x11, #0x0\n"
+ "ld1r { v3.4s }, [x19]\n"
+ "lsr x10, %x[n_channels], #0x2\n"
+ "cbz x10, 5f\n"
+ "1:" // Channel loop
+ "movi v25.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov v24.16b, v25.16b\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v22.16b, v25.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "mov v21.16b, v25.16b\n"
+ "ldr q2, [x9, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "add %x[params], %x[params], #0x10\n"
+ "mov v19.16b, v25.16b\n"
+ "ldr q1, [x28, x11]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr q0, [x27, x11]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr q31, [x26, x11]\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ldr q30, [x25, x11]\n"
+ "ldr q29, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ldr q28, [x23, x11]\n"
+ "ldr q27, [x22, x11]\n"
+ "ldr x21, [x20], #0x8\n"
+ "ldr q26, [x21, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "fmla v25.4s, v2.4s, v23.4s\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "fmla v24.4s, v1.4s, v23.4s\n"
+ "ldr q2, [x9, x11]\n"
+ "fmla v22.4s, v0.4s, v23.4s\n"
+ "fmla v21.4s, v31.4s, v23.4s\n"
+ "ldr q1, [x28, x11]\n"
+ "fmla v20.4s, v30.4s, v23.4s\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "fmla v19.4s, v29.4s, v23.4s\n"
+ "fmla v18.4s, v28.4s, v23.4s\n"
+ "ldr q0, [x27, x11]\n"
+ "fmla v17.4s, v27.4s, v23.4s\n"
+ "fmla v16.4s, v26.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q31, [x26, x11]\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ldr q30, [x25, x11]\n"
+ "ldr q29, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ldr q28, [x23, x11]\n"
+ "ldr q27, [x22, x11]\n"
+ "ldr x21, [x20], #0x8\n"
+ "ldr q26, [x21, x11]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla v25.4s, v2.4s, v23.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "fmla v24.4s, v1.4s, v23.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v0.4s, v23.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "fmla v21.4s, v31.4s, v23.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "fmla v20.4s, v30.4s, v23.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmla v19.4s, v29.4s, v23.4s\n"
+ "fmla v18.4s, v28.4s, v23.4s\n"
+ "fmla v17.4s, v27.4s, v23.4s\n"
+ "fmla v16.4s, v26.4s, v23.4s\n"
+ "fmax v25.4s, v25.4s, v4.4s\n"
+ "fmax v24.4s, v24.4s, v4.4s\n"
+ "fmax v22.4s, v22.4s, v4.4s\n"
+ "fmin v25.4s, v25.4s, v3.4s\n"
+ "str q25, [x27, x11]\n"
+ "fmin v24.4s, v24.4s, v3.4s\n"
+ "fmin v22.4s, v22.4s, v3.4s\n"
+ "str q24, [x26, x11]\n"
+ "fmax v21.4s, v21.4s, v4.4s\n"
+ "fmax v20.4s, v20.4s, v4.4s\n"
+ "str q22, [x25, x11]\n"
+ "fmax v19.4s, v19.4s, v4.4s\n"
+ "fmax v18.4s, v18.4s, v4.4s\n"
+ "fmin v21.4s, v21.4s, v3.4s\n"
+ "str q21, [x24, x11]\n"
+ "fmin v20.4s, v20.4s, v3.4s\n"
+ "fmin v19.4s, v19.4s, v3.4s\n"
+ "str q20, [x23, x11]\n"
+ "fmin v18.4s, v18.4s, v3.4s\n"
+ "fmax v17.4s, v17.4s, v4.4s\n"
+ "str q19, [x22, x11]\n"
+ "fmax v16.4s, v16.4s, v4.4s\n"
+ "str q18, [x21, x11]\n"
+ "fmin v17.4s, v17.4s, v3.4s\n"
+ "fmin v16.4s, v16.4s, v3.4s\n"
+ "str q17, [x20, x11]\n"
+ "str q16, [x19, x11]\n"
+ "add x11, x11, #0x10\n"
+ "cmp x11, x10, LSL #4\n"
+ "blt 1b\n"
+ "5:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 17f\n"
+ "movi v25.16b, #0x0\n"
+ "cbz %x[bias], 8f\n"
+ "add x19, %x[bias], x11\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v25.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v25.s }[2], [x19], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v25.s }[0], [x19], #0x4\n"
+ "7:" // Oddments: Load bias: Bit 1: End
+
+ "8:" // Oddments: Load bias: Done
+ "mov v24.16b, v25.16b\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v22.16b, v25.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add %x[params], %x[params], #0x10\n"
+ "mov v21.16b, v25.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v20.16b, v25.16b\n"
+ "add x9, x9, x11\n"
+ "mov v19.16b, v25.16b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "mov v18.16b, v25.16b\n"
+ "add x28, x28, x11\n"
+ "mov v17.16b, v25.16b\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "mov v16.16b, v25.16b\n"
+ "add x27, x27, x11\n"
+ "ldr x21, [x20], #0x8\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d2, [x9], #0x8\n"
+ "ldr d1, [x28], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d30, [x25], #0x8\n"
+ "ldr d29, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v2.s }[2], [x9], #0x4\n"
+ "ld1 { v1.s }[2], [x28], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v30.s }[2], [x25], #0x4\n"
+ "ld1 { v29.s }[2], [x24], #0x4\n"
+ "ld1 { v28.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v26.s }[2], [x21], #0x4\n"
+ "b 10f\n"
+ "9:" // Oddments: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ldr s2, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s30, [x25], #0x4\n"
+ "ldr s29, [x24], #0x4\n"
+ "ldr s28, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "10:" // Oddments: Load: Bit 1: End
+ "subs x19, %x[n_points], #0x1\n"
+ "ble 14f\n"
+ "11:" // Oddments: Planar loop
+ "fmla v25.4s, v2.4s, v23.4s\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "fmla v24.4s, v1.4s, v23.4s\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "fmla v22.4s, v0.4s, v23.4s\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "fmla v21.4s, v31.4s, v23.4s\n"
+ "add x28, x28, x11\n"
+ "fmla v20.4s, v30.4s, v23.4s\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "fmla v19.4s, v29.4s, v23.4s\n"
+ "add x27, x27, x11\n"
+ "fmla v18.4s, v28.4s, v23.4s\n"
+ "ldr x21, [x20], #0x8\n"
+ "fmla v17.4s, v27.4s, v23.4s\n"
+ "add x26, x26, x11\n"
+ "fmla v16.4s, v26.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ldr d2, [x9], #0x8\n"
+ "ldr d1, [x28], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d30, [x25], #0x8\n"
+ "ldr d29, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v2.s }[2], [x9], #0x4\n"
+ "ld1 { v1.s }[2], [x28], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v30.s }[2], [x25], #0x4\n"
+ "ld1 { v29.s }[2], [x24], #0x4\n"
+ "ld1 { v28.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v26.s }[2], [x21], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 13f\n"
+ "ldr s2, [x9], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s30, [x25], #0x4\n"
+ "ldr s29, [x24], #0x4\n"
+ "ldr s28, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x19, x19, #0x1\n"
+ "bgt 11b\n"
+ "14:" // Oddments: Planar tail
+ "fmla v25.4s, v2.4s, v23.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "add x27, x27, x11\n"
+ "fmla v24.4s, v1.4s, v23.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v0.4s, v23.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "add x26, x26, x11\n"
+ "fmla v21.4s, v31.4s, v23.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "fmla v20.4s, v30.4s, v23.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x25, x25, x11\n"
+ "fmla v19.4s, v29.4s, v23.4s\n"
+ "add x24, x24, x11\n"
+ "fmla v18.4s, v28.4s, v23.4s\n"
+ "add x23, x23, x11\n"
+ "fmla v17.4s, v27.4s, v23.4s\n"
+ "add x22, x22, x11\n"
+ "fmla v16.4s, v26.4s, v23.4s\n"
+ "add x21, x21, x11\n"
+ "fmax v25.4s, v25.4s, v4.4s\n"
+ "add x20, x20, x11\n"
+ "fmax v24.4s, v24.4s, v4.4s\n"
+ "add x19, x19, x11\n"
+ "fmax v22.4s, v22.4s, v4.4s\n"
+ "fmin v25.4s, v25.4s, v3.4s\n"
+ "fmin v24.4s, v24.4s, v3.4s\n"
+ "fmin v22.4s, v22.4s, v3.4s\n"
+ "fmax v21.4s, v21.4s, v4.4s\n"
+ "fmax v20.4s, v20.4s, v4.4s\n"
+ "fmax v19.4s, v19.4s, v4.4s\n"
+ "fmin v21.4s, v21.4s, v3.4s\n"
+ "fmin v20.4s, v20.4s, v3.4s\n"
+ "fmin v19.4s, v19.4s, v3.4s\n"
+ "fmax v18.4s, v18.4s, v4.4s\n"
+ "fmax v17.4s, v17.4s, v4.4s\n"
+ "fmax v16.4s, v16.4s, v4.4s\n"
+ "fmin v18.4s, v18.4s, v3.4s\n"
+ "fmin v17.4s, v17.4s, v3.4s\n"
+ "fmin v16.4s, v16.4s, v3.4s\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "st1 { v25.d }[0], [x27], #0x8\n"
+ "st1 { v24.d }[0], [x26], #0x8\n"
+ "st1 { v22.d }[0], [x25], #0x8\n"
+ "st1 { v21.d }[0], [x24], #0x8\n"
+ "st1 { v20.d }[0], [x23], #0x8\n"
+ "st1 { v19.d }[0], [x22], #0x8\n"
+ "st1 { v18.d }[0], [x21], #0x8\n"
+ "st1 { v17.d }[0], [x20], #0x8\n"
+ "st1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "st1 { v25.s }[2], [x27], #0x4\n"
+ "st1 { v24.s }[2], [x26], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "st1 { v21.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v19.s }[2], [x22], #0x4\n"
+ "st1 { v18.s }[2], [x21], #0x4\n"
+ "st1 { v17.s }[2], [x20], #0x4\n"
+ "st1 { v16.s }[2], [x19], #0x4\n"
+ "b 16f\n"
+ "15:" // Oddments: Store: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "st1 { v25.s }[0], [x27], #0x4\n"
+ "st1 { v24.s }[0], [x26], #0x4\n"
+ "st1 { v22.s }[0], [x25], #0x4\n"
+ "st1 { v21.s }[0], [x24], #0x4\n"
+ "st1 { v20.s }[0], [x23], #0x4\n"
+ "st1 { v19.s }[0], [x22], #0x4\n"
+ "st1 { v18.s }[0], [x21], #0x4\n"
+ "st1 { v17.s }[0], [x20], #0x4\n"
+ "st1 { v16.s }[0], [x19], #0x4\n"
+ "16:" // Oddments: Store: Bit 1: End
+
+ "17:" // End
+
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
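
[Editor's note, not part of the patch] For reference, the computation the assembly above performs, per channel c and output point o, is out[o][c] = clamp(bias[c] + sum over p of in[p*9 + o][c] * w[p][c], activation_min, activation_max), where inptrs supplies nine pointers per point. A scalar sketch, assuming for clarity an unblocked [point][channel] weight layout (the real params buffer is consumed in four-channel blocks, as the 16-byte params strides in the assembly show):

    #include <algorithm>

    static void reference_output9(const float *const *inptrs,
                                  float *const *outptrs,
                                  const float *weights,  // [n_points][n_channels], simplified
                                  const float *bias,     // may be nullptr
                                  unsigned int n_points, unsigned int n_channels,
                                  float act_min, float act_max)
    {
      for (unsigned int c = 0; c < n_channels; c++)
      {
        for (unsigned int o = 0; o < 9; o++)  // nine output points per tile
        {
          float acc = bias ? bias[c] : 0.0f;  // "movi v25.16b, #0x0" when no bias
          for (unsigned int p = 0; p < n_points; p++)
          {
            acc += inptrs[p * 9 + o][c] * weights[p * n_channels + c];
          }
          // fmax/fmin against the broadcast activation bounds
          outptrs[o][c] = std::min(std::max(acc, act_min), act_max);
        }
      }
    }
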
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..60f5ddd68f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ constexpr static unsigned int input_rows = 7;
+ constexpr static unsigned int input_cols = 7;
+ constexpr static unsigned int input_col_quads = 2;
+
+ kern_type kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+
+ a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
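
[Editor's note, not part of the patch] The input tile constants above are not independent: for a depthfirst tile, input extent = stride * (output extent - 1) + kernel extent, and the packed-input width is rounded up to whole four-float quads. The same relation holds for the 5x5/s1 descriptor later in this patch (rows: 1*(2-1)+5 = 6; cols: 1*(4-1)+5 = 8). An illustrative compile-time check:

    using S = arm_conv::depthwise::
        a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst;
    static_assert(S::input_rows == S::stride_rows * (S::output_rows - 1) + S::kernel_rows,
                  "input rows cover the strided 3x3 output tile");
    static_assert(S::input_cols == S::stride_cols * (S::output_cols - 1) + S::kernel_cols,
                  "input cols cover the strided 3x3 output tile");
    static_assert(S::input_col_quads == (S::input_cols + 3) / 4,
                  "packed input columns rounded up to 4-float quads");
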
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5e334ec7b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ldp x14, x13, [%x[outptrs], #0x0]\n"
+ "add x12, %x[clamps], #0x4\n"
+ "ldp x11, x10, [%x[outptrs], #0x10]\n"
+ "mov x9, #0x0\n"
+ "ldp x28, x27, [%x[outptrs], #0x20]\n"
+ "mov x26, #0x0\n"
+ "ldp x25, x24, [%x[outptrs], #0x30]\n"
+ "lsr x23, %x[channel_multiplier], #0x2\n"
+ "ldr x22, [%x[outptrs], #0x40]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr q4, [x19, #0x0]\n"
+ "ldr q5, [x19, #0x10]\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr x19, [%x[inptrs], #0x28]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr q10, [x19, #0x0]\n"
+ "ldr q11, [x19, #0x10]\n"
+ "ldr x19, [%x[inptrs], #0x30]\n"
+ "ld1r { v24.4s }, [%x[clamps]]\n"
+ "ld1r { v23.4s }, [x12]\n"
+ "ldr q12, [x19, #0x0]\n"
+ "ldr q13, [x19, #0x10]\n"
+ "cbz x23, 3f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "mov v15.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x23, x23, #0x1\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "mov v17.16b, v14.16b\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "add x9, x9, #0x4\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q30, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q29, [%x[params], #0x20]\n"
+ "fmla v14.4s, v31.4s, v2.s[0]\n"
+ "fmla v15.4s, v31.4s, v2.s[2]\n"
+ "fmla v16.4s, v31.4s, v3.s[0]\n"
+ "fmla v17.4s, v31.4s, v6.s[0]\n"
+ "fmla v18.4s, v31.4s, v6.s[2]\n"
+ "fmla v19.4s, v31.4s, v7.s[0]\n"
+ "fmla v20.4s, v31.4s, v10.s[0]\n"
+ "fmla v21.4s, v31.4s, v10.s[2]\n"
+ "fmla v22.4s, v31.4s, v11.s[0]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "fmla v14.4s, v30.4s, v2.s[1]\n"
+ "fmla v15.4s, v30.4s, v2.s[3]\n"
+ "fmla v16.4s, v30.4s, v3.s[1]\n"
+ "fmla v17.4s, v30.4s, v6.s[1]\n"
+ "fmla v18.4s, v30.4s, v6.s[3]\n"
+ "fmla v19.4s, v30.4s, v7.s[1]\n"
+ "fmla v20.4s, v30.4s, v10.s[1]\n"
+ "fmla v21.4s, v30.4s, v10.s[3]\n"
+ "fmla v22.4s, v30.4s, v11.s[1]\n"
+ "ldr q30, [%x[params], #0x40]\n"
+ "fmla v14.4s, v29.4s, v2.s[2]\n"
+ "fmla v15.4s, v29.4s, v3.s[0]\n"
+ "fmla v16.4s, v29.4s, v3.s[2]\n"
+ "fmla v17.4s, v29.4s, v6.s[2]\n"
+ "fmla v18.4s, v29.4s, v7.s[0]\n"
+ "fmla v19.4s, v29.4s, v7.s[2]\n"
+ "fmla v20.4s, v29.4s, v10.s[2]\n"
+ "fmla v21.4s, v29.4s, v11.s[0]\n"
+ "fmla v22.4s, v29.4s, v11.s[2]\n"
+ "ldr q29, [%x[params], #0x50]\n"
+ "fmla v14.4s, v31.4s, v4.s[0]\n"
+ "fmla v15.4s, v31.4s, v4.s[2]\n"
+ "fmla v16.4s, v31.4s, v5.s[0]\n"
+ "fmla v17.4s, v31.4s, v8.s[0]\n"
+ "fmla v18.4s, v31.4s, v8.s[2]\n"
+ "fmla v19.4s, v31.4s, v9.s[0]\n"
+ "fmla v20.4s, v31.4s, v12.s[0]\n"
+ "fmla v21.4s, v31.4s, v12.s[2]\n"
+ "fmla v22.4s, v31.4s, v13.s[0]\n"
+ "ldr q31, [%x[params], #0x70]\n"
+ "fmla v14.4s, v30.4s, v4.s[1]\n"
+ "fmla v15.4s, v30.4s, v4.s[3]\n"
+ "fmla v16.4s, v30.4s, v5.s[1]\n"
+ "fmla v17.4s, v30.4s, v8.s[1]\n"
+ "fmla v18.4s, v30.4s, v8.s[3]\n"
+ "fmla v19.4s, v30.4s, v9.s[1]\n"
+ "fmla v20.4s, v30.4s, v12.s[1]\n"
+ "fmla v21.4s, v30.4s, v12.s[3]\n"
+ "fmla v22.4s, v30.4s, v13.s[1]\n"
+ "ldr q30, [%x[params], #0x80]\n"
+ "fmla v14.4s, v29.4s, v4.s[2]\n"
+ "fmla v15.4s, v29.4s, v5.s[0]\n"
+ "fmla v16.4s, v29.4s, v5.s[2]\n"
+ "fmla v17.4s, v29.4s, v8.s[2]\n"
+ "fmla v18.4s, v29.4s, v9.s[0]\n"
+ "fmla v19.4s, v29.4s, v9.s[2]\n"
+ "fmla v20.4s, v29.4s, v12.s[2]\n"
+ "fmla v21.4s, v29.4s, v13.s[0]\n"
+ "fmla v22.4s, v29.4s, v13.s[2]\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "fmin v14.4s, v14.4s, v23.4s\n"
+ "fmin v15.4s, v15.4s, v23.4s\n"
+ "fmin v16.4s, v16.4s, v23.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "str q14, [x14, x26]\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "ldr q14, [%x[params], #0x60]\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmin v17.4s, v17.4s, v23.4s\n"
+ "str q15, [x13, x26]\n"
+ "fmin v18.4s, v18.4s, v23.4s\n"
+ "fmin v19.4s, v19.4s, v23.4s\n"
+ "str q16, [x11, x26]\n"
+ "fmin v20.4s, v20.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "str q17, [x10, x26]\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "str q18, [x28, x26]\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmin v21.4s, v21.4s, v23.4s\n"
+ "str q19, [x27, x26]\n"
+ "fmin v22.4s, v22.4s, v23.4s\n"
+ "str q20, [x25, x26]\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "mov v15.16b, v14.16b\n"
+ "str q21, [x24, x26]\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "mov v16.16b, v14.16b\n"
+ "str q22, [x22, x26]\n"
+ "mov v17.16b, v14.16b\n"
+ "add x26, x26, #0x10\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q30, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q29, [%x[params], #0x20]\n"
+ "fmla v14.4s, v31.4s, v2.s[0]\n"
+ "fmla v15.4s, v31.4s, v2.s[2]\n"
+ "fmla v16.4s, v31.4s, v3.s[0]\n"
+ "fmla v17.4s, v31.4s, v6.s[0]\n"
+ "fmla v18.4s, v31.4s, v6.s[2]\n"
+ "fmla v19.4s, v31.4s, v7.s[0]\n"
+ "fmla v20.4s, v31.4s, v10.s[0]\n"
+ "fmla v21.4s, v31.4s, v10.s[2]\n"
+ "fmla v22.4s, v31.4s, v11.s[0]\n"
+ "ldr q31, [%x[params], #0x30]\n"
+ "fmla v14.4s, v30.4s, v2.s[1]\n"
+ "fmla v15.4s, v30.4s, v2.s[3]\n"
+ "fmla v16.4s, v30.4s, v3.s[1]\n"
+ "fmla v17.4s, v30.4s, v6.s[1]\n"
+ "fmla v18.4s, v30.4s, v6.s[3]\n"
+ "fmla v19.4s, v30.4s, v7.s[1]\n"
+ "fmla v20.4s, v30.4s, v10.s[1]\n"
+ "fmla v21.4s, v30.4s, v10.s[3]\n"
+ "fmla v22.4s, v30.4s, v11.s[1]\n"
+ "ldr q30, [%x[params], #0x40]\n"
+ "fmla v14.4s, v29.4s, v2.s[2]\n"
+ "fmla v15.4s, v29.4s, v3.s[0]\n"
+ "fmla v16.4s, v29.4s, v3.s[2]\n"
+ "fmla v17.4s, v29.4s, v6.s[2]\n"
+ "fmla v18.4s, v29.4s, v7.s[0]\n"
+ "fmla v19.4s, v29.4s, v7.s[2]\n"
+ "fmla v20.4s, v29.4s, v10.s[2]\n"
+ "fmla v21.4s, v29.4s, v11.s[0]\n"
+ "fmla v22.4s, v29.4s, v11.s[2]\n"
+ "ldr q29, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "fmla v14.4s, v31.4s, v4.s[0]\n"
+ "fmla v15.4s, v31.4s, v4.s[2]\n"
+ "fmla v16.4s, v31.4s, v5.s[0]\n"
+ "fmla v17.4s, v31.4s, v8.s[0]\n"
+ "fmla v18.4s, v31.4s, v8.s[2]\n"
+ "fmla v19.4s, v31.4s, v9.s[0]\n"
+ "fmla v20.4s, v31.4s, v12.s[0]\n"
+ "fmla v21.4s, v31.4s, v12.s[2]\n"
+ "fmla v22.4s, v31.4s, v13.s[0]\n"
+ "fmla v14.4s, v30.4s, v4.s[1]\n"
+ "fmla v15.4s, v30.4s, v4.s[3]\n"
+ "fmla v16.4s, v30.4s, v5.s[1]\n"
+ "fmla v17.4s, v30.4s, v8.s[1]\n"
+ "fmla v18.4s, v30.4s, v8.s[3]\n"
+ "fmla v19.4s, v30.4s, v9.s[1]\n"
+ "fmla v20.4s, v30.4s, v12.s[1]\n"
+ "fmla v21.4s, v30.4s, v12.s[3]\n"
+ "fmla v22.4s, v30.4s, v13.s[1]\n"
+ "fmla v14.4s, v29.4s, v4.s[2]\n"
+ "fmla v15.4s, v29.4s, v5.s[0]\n"
+ "fmla v16.4s, v29.4s, v5.s[2]\n"
+ "fmla v17.4s, v29.4s, v8.s[2]\n"
+ "fmla v18.4s, v29.4s, v9.s[0]\n"
+ "fmla v19.4s, v29.4s, v9.s[2]\n"
+ "fmla v20.4s, v29.4s, v12.s[2]\n"
+ "fmla v21.4s, v29.4s, v13.s[0]\n"
+ "fmla v22.4s, v29.4s, v13.s[2]\n"
+ "fmin v14.4s, v14.4s, v23.4s\n"
+ "fmin v15.4s, v15.4s, v23.4s\n"
+ "fmin v16.4s, v16.4s, v23.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "str q14, [x14, x26]\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "str q15, [x13, x26]\n"
+ "fmin v17.4s, v17.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v23.4s\n"
+ "str q16, [x11, x26]\n"
+ "fmin v19.4s, v19.4s, v23.4s\n"
+ "fmin v20.4s, v20.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "str q17, [x10, x26]\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "str q18, [x28, x26]\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmin v21.4s, v21.4s, v23.4s\n"
+ "str q19, [x27, x26]\n"
+ "fmin v22.4s, v22.4s, v23.4s\n"
+ "str q20, [x25, x26]\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "str q21, [x24, x26]\n"
+ "str q22, [x22, x26]\n"
+ "add x26, x26, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "mov v15.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "mov v17.16b, v14.16b\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q31, [%x[params], #0x40]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q30, [%x[params], #0x50]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q29, [%x[params], #0x60]\n"
+ "fmla v14.4s, v31.4s, v2.s[0]\n"
+ "fmla v15.4s, v31.4s, v2.s[2]\n"
+ "fmla v16.4s, v31.4s, v3.s[0]\n"
+ "fmla v17.4s, v31.4s, v6.s[0]\n"
+ "fmla v18.4s, v31.4s, v6.s[2]\n"
+ "fmla v19.4s, v31.4s, v7.s[0]\n"
+ "fmla v20.4s, v31.4s, v10.s[0]\n"
+ "fmla v21.4s, v31.4s, v10.s[2]\n"
+ "fmla v22.4s, v31.4s, v11.s[0]\n"
+ "ldr q31, [%x[params], #0x70]\n"
+ "fmla v14.4s, v30.4s, v2.s[1]\n"
+ "fmla v15.4s, v30.4s, v2.s[3]\n"
+ "fmla v16.4s, v30.4s, v3.s[1]\n"
+ "fmla v17.4s, v30.4s, v6.s[1]\n"
+ "fmla v18.4s, v30.4s, v6.s[3]\n"
+ "fmla v19.4s, v30.4s, v7.s[1]\n"
+ "fmla v20.4s, v30.4s, v10.s[1]\n"
+ "fmla v21.4s, v30.4s, v10.s[3]\n"
+ "fmla v22.4s, v30.4s, v11.s[1]\n"
+ "ldr q30, [%x[params], #0x80]\n"
+ "fmla v14.4s, v29.4s, v2.s[2]\n"
+ "fmla v15.4s, v29.4s, v3.s[0]\n"
+ "fmla v16.4s, v29.4s, v3.s[2]\n"
+ "fmla v17.4s, v29.4s, v6.s[2]\n"
+ "fmla v18.4s, v29.4s, v7.s[0]\n"
+ "fmla v19.4s, v29.4s, v7.s[2]\n"
+ "fmla v20.4s, v29.4s, v10.s[2]\n"
+ "fmla v21.4s, v29.4s, v11.s[0]\n"
+ "fmla v22.4s, v29.4s, v11.s[2]\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmla v14.4s, v31.4s, v4.s[0]\n"
+ "fmla v15.4s, v31.4s, v4.s[2]\n"
+ "fmla v16.4s, v31.4s, v5.s[0]\n"
+ "fmla v17.4s, v31.4s, v8.s[0]\n"
+ "fmla v18.4s, v31.4s, v8.s[2]\n"
+ "fmla v19.4s, v31.4s, v9.s[0]\n"
+ "fmla v20.4s, v31.4s, v12.s[0]\n"
+ "fmla v21.4s, v31.4s, v12.s[2]\n"
+ "fmla v22.4s, v31.4s, v13.s[0]\n"
+ "fmla v14.4s, v30.4s, v4.s[1]\n"
+ "fmla v15.4s, v30.4s, v4.s[3]\n"
+ "fmla v16.4s, v30.4s, v5.s[1]\n"
+ "fmla v17.4s, v30.4s, v8.s[1]\n"
+ "fmla v18.4s, v30.4s, v8.s[3]\n"
+ "fmla v19.4s, v30.4s, v9.s[1]\n"
+ "fmla v20.4s, v30.4s, v12.s[1]\n"
+ "fmla v21.4s, v30.4s, v12.s[3]\n"
+ "fmla v22.4s, v30.4s, v13.s[1]\n"
+ "fmla v14.4s, v29.4s, v4.s[2]\n"
+ "fmla v15.4s, v29.4s, v5.s[0]\n"
+ "fmla v16.4s, v29.4s, v5.s[2]\n"
+ "fmla v17.4s, v29.4s, v8.s[2]\n"
+ "fmla v18.4s, v29.4s, v9.s[0]\n"
+ "fmla v19.4s, v29.4s, v9.s[2]\n"
+ "fmla v20.4s, v29.4s, v12.s[2]\n"
+ "fmla v21.4s, v29.4s, v13.s[0]\n"
+ "fmla v22.4s, v29.4s, v13.s[2]\n"
+ "fmin v14.4s, v14.4s, v23.4s\n"
+ "fmin v15.4s, v15.4s, v23.4s\n"
+ "fmin v16.4s, v16.4s, v23.4s\n"
+ "fmax v14.4s, v14.4s, v24.4s\n"
+ "fmax v15.4s, v15.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v24.4s\n"
+ "fmin v17.4s, v17.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v23.4s\n"
+ "fmin v19.4s, v19.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v24.4s\n"
+ "fmax v18.4s, v18.4s, v24.4s\n"
+ "fmax v19.4s, v19.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v23.4s\n"
+ "fmin v22.4s, v22.4s, v23.4s\n"
+ "fmax v20.4s, v20.4s, v24.4s\n"
+ "fmax v21.4s, v21.4s, v24.4s\n"
+ "fmax v22.4s, v22.4s, v24.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x19, x14, x26\n"
+ "st1 { v14.d }[0], [x19]\n"
+ "add x19, x13, x26\n"
+ "st1 { v15.d }[0], [x19]\n"
+ "add x19, x11, x26\n"
+ "st1 { v16.d }[0], [x19]\n"
+ "add x19, x10, x26\n"
+ "st1 { v17.d }[0], [x19]\n"
+ "add x19, x28, x26\n"
+ "st1 { v18.d }[0], [x19]\n"
+ "add x19, x27, x26\n"
+ "st1 { v19.d }[0], [x19]\n"
+ "add x19, x25, x26\n"
+ "st1 { v20.d }[0], [x19]\n"
+ "add x19, x24, x26\n"
+ "st1 { v21.d }[0], [x19]\n"
+ "add x19, x22, x26\n"
+ "st1 { v22.d }[0], [x19]\n"
+ "add x26, x26, #0x8\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x19, x14, x26\n"
+ "st1 { v14.s }[2], [x19]\n"
+ "add x19, x13, x26\n"
+ "st1 { v15.s }[2], [x19]\n"
+ "add x19, x11, x26\n"
+ "st1 { v16.s }[2], [x19]\n"
+ "add x19, x10, x26\n"
+ "st1 { v17.s }[2], [x19]\n"
+ "add x19, x28, x26\n"
+ "st1 { v18.s }[2], [x19]\n"
+ "add x19, x27, x26\n"
+ "st1 { v19.s }[2], [x19]\n"
+ "add x19, x25, x26\n"
+ "st1 { v20.s }[2], [x19]\n"
+ "add x19, x24, x26\n"
+ "st1 { v21.s }[2], [x19]\n"
+ "add x19, x22, x26\n"
+ "st1 { v22.s }[2], [x19]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x19, x14, x26\n"
+ "st1 { v14.s }[0], [x19]\n"
+ "add x19, x13, x26\n"
+ "st1 { v15.s }[0], [x19]\n"
+ "add x19, x11, x26\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "add x19, x10, x26\n"
+ "st1 { v17.s }[0], [x19]\n"
+ "add x19, x28, x26\n"
+ "st1 { v18.s }[0], [x19]\n"
+ "add x19, x27, x26\n"
+ "st1 { v19.s }[0], [x19]\n"
+ "add x19, x25, x26\n"
+ "st1 { v20.s }[0], [x19]\n"
+ "add x19, x24, x26\n"
+ "st1 { v21.s }[0], [x19]\n"
+ "add x19, x22, x26\n"
+ "st1 { v22.s }[0], [x19]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+
+ "6:" // End
+
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
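
[Editor's note, not part of the patch] A note on the lane indexing in the kernel above: with stride 2, adjacent output columns sit two packed floats apart, so the broadcasts v0.s[0], v0.s[1], v0.s[2] feeding output column 0 become v0.s[2], v0.s[3], v1.s[0] for column 1, and v1.s[0], v1.s[1], v1.s[2] for column 2. A scalar sketch of one filter row's contribution, with illustrative names not taken from the patch:

    // One kernel row's contribution to the three output columns of the
    // stride-2 3x3 tile: output column j reads input columns 2*j .. 2*j+2.
    static void accumulate_row_s2_3x3(float acc[3], const float input_row[7],
                                      const float weight_row[3])
    {
      for (int j = 0; j < 3; j++)       // output columns
      {
        for (int k = 0; k < 3; k++)     // kernel columns
        {
          acc[j] += input_row[2 * j + k] * weight_row[k];
        }
      }
    }
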
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..92d6a757f2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 8;
+ constexpr static unsigned int input_col_quads = 2;
+
+ kern_type kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+
+ a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
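Note: the geometry constants in this header are linked. For a stride-1 kernel the input tile must cover output + kernel - 1 positions in each dimension (2 + 5 - 1 = 6 rows, 4 + 5 - 1 = 8 cols), and `input_col_quads` counts 128-bit groups of four floats per input row. A compile-time sketch of those relationships, using a helper of our own rather than anything from the library:

    constexpr unsigned int required_input(unsigned int output, unsigned int kernel,
                                          unsigned int stride)
    {
      // Input positions needed to produce `output` outputs with a given kernel/stride.
      return (output - 1) * stride + kernel;
    }

    static_assert(required_input(2, 5, 1) == 6, "matches input_rows above");
    static_assert(required_input(4, 5, 1) == 8, "matches input_cols above");
    static_assert(required_input(4, 5, 1) / 4 == 2, "matches input_col_quads above");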
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..6e9e97fa29
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ldp x13, x12, [%x[outptrs], #0x0]\n"
+ "add x11, %x[clamps], #0x4\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x20]\n"
+ "mov x25, #0x0\n"
+ "ldp x24, x23, [%x[outptrs], #0x30]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr q4, [x19, #0x0]\n"
+ "ldr q5, [x19, #0x10]\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr x19, [%x[inptrs], #0x28]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr q10, [x19, #0x0]\n"
+ "ldr q11, [x19, #0x10]\n"
+ "ld1r { v21.4s }, [%x[clamps]]\n"
+ "ld1r { v20.4s }, [x11]\n"
+ "cbz x22, 3f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "mov v13.16b, v12.16b\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v14.16b, v12.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "mov v15.16b, v12.16b\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v16.16b, v12.16b\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "mov v17.16b, v12.16b\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "add x28, x28, #0x4\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q30, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q29, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q28, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "fmla v12.4s, v31.4s, v2.s[0]\n"
+ "fmla v13.4s, v31.4s, v2.s[1]\n"
+ "fmla v14.4s, v31.4s, v2.s[2]\n"
+ "fmla v15.4s, v31.4s, v2.s[3]\n"
+ "fmla v16.4s, v31.4s, v4.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[1]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v4.s[3]\n"
+ "ldr q31, [%x[params], #0x50]\n"
+ "fmla v12.4s, v30.4s, v2.s[1]\n"
+ "fmla v13.4s, v30.4s, v2.s[2]\n"
+ "fmla v14.4s, v30.4s, v2.s[3]\n"
+ "fmla v15.4s, v30.4s, v3.s[0]\n"
+ "fmla v16.4s, v30.4s, v4.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[2]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[0]\n"
+ "ldr q30, [%x[params], #0x60]\n"
+ "fmla v12.4s, v29.4s, v2.s[2]\n"
+ "fmla v13.4s, v29.4s, v2.s[3]\n"
+ "fmla v14.4s, v29.4s, v3.s[0]\n"
+ "fmla v15.4s, v29.4s, v3.s[1]\n"
+ "fmla v16.4s, v29.4s, v4.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[3]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[1]\n"
+ "ldr q29, [%x[params], #0x70]\n"
+ "fmla v12.4s, v28.4s, v2.s[3]\n"
+ "fmla v13.4s, v28.4s, v3.s[0]\n"
+ "fmla v14.4s, v28.4s, v3.s[1]\n"
+ "fmla v15.4s, v28.4s, v3.s[2]\n"
+ "fmla v16.4s, v28.4s, v4.s[3]\n"
+ "fmla v17.4s, v28.4s, v5.s[0]\n"
+ "fmla v18.4s, v28.4s, v5.s[1]\n"
+ "fmla v19.4s, v28.4s, v5.s[2]\n"
+ "ldr q28, [%x[params], #0x80]\n"
+ "fmla v12.4s, v27.4s, v3.s[0]\n"
+ "fmla v13.4s, v27.4s, v3.s[1]\n"
+ "fmla v14.4s, v27.4s, v3.s[2]\n"
+ "fmla v15.4s, v27.4s, v3.s[3]\n"
+ "fmla v16.4s, v27.4s, v5.s[0]\n"
+ "fmla v17.4s, v27.4s, v5.s[1]\n"
+ "fmla v18.4s, v27.4s, v5.s[2]\n"
+ "fmla v19.4s, v27.4s, v5.s[3]\n"
+ "ldr q27, [%x[params], #0x90]\n"
+ "fmla v12.4s, v31.4s, v4.s[0]\n"
+ "fmla v13.4s, v31.4s, v4.s[1]\n"
+ "fmla v14.4s, v31.4s, v4.s[2]\n"
+ "fmla v15.4s, v31.4s, v4.s[3]\n"
+ "fmla v16.4s, v31.4s, v6.s[0]\n"
+ "fmla v17.4s, v31.4s, v6.s[1]\n"
+ "fmla v18.4s, v31.4s, v6.s[2]\n"
+ "fmla v19.4s, v31.4s, v6.s[3]\n"
+ "ldr q31, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v30.4s, v4.s[1]\n"
+ "fmla v13.4s, v30.4s, v4.s[2]\n"
+ "fmla v14.4s, v30.4s, v4.s[3]\n"
+ "fmla v15.4s, v30.4s, v5.s[0]\n"
+ "fmla v16.4s, v30.4s, v6.s[1]\n"
+ "fmla v17.4s, v30.4s, v6.s[2]\n"
+ "fmla v18.4s, v30.4s, v6.s[3]\n"
+ "fmla v19.4s, v30.4s, v7.s[0]\n"
+ "ldr q30, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v29.4s, v4.s[2]\n"
+ "fmla v13.4s, v29.4s, v4.s[3]\n"
+ "fmla v14.4s, v29.4s, v5.s[0]\n"
+ "fmla v15.4s, v29.4s, v5.s[1]\n"
+ "fmla v16.4s, v29.4s, v6.s[2]\n"
+ "fmla v17.4s, v29.4s, v6.s[3]\n"
+ "fmla v18.4s, v29.4s, v7.s[0]\n"
+ "fmla v19.4s, v29.4s, v7.s[1]\n"
+ "ldr q29, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v28.4s, v4.s[3]\n"
+ "fmla v13.4s, v28.4s, v5.s[0]\n"
+ "fmla v14.4s, v28.4s, v5.s[1]\n"
+ "fmla v15.4s, v28.4s, v5.s[2]\n"
+ "fmla v16.4s, v28.4s, v6.s[3]\n"
+ "fmla v17.4s, v28.4s, v7.s[0]\n"
+ "fmla v18.4s, v28.4s, v7.s[1]\n"
+ "fmla v19.4s, v28.4s, v7.s[2]\n"
+ "ldr q28, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v27.4s, v5.s[0]\n"
+ "fmla v13.4s, v27.4s, v5.s[1]\n"
+ "fmla v14.4s, v27.4s, v5.s[2]\n"
+ "fmla v15.4s, v27.4s, v5.s[3]\n"
+ "fmla v16.4s, v27.4s, v7.s[0]\n"
+ "fmla v17.4s, v27.4s, v7.s[1]\n"
+ "fmla v18.4s, v27.4s, v7.s[2]\n"
+ "fmla v19.4s, v27.4s, v7.s[3]\n"
+ "ldr q27, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v31.4s, v6.s[0]\n"
+ "fmla v13.4s, v31.4s, v6.s[1]\n"
+ "fmla v14.4s, v31.4s, v6.s[2]\n"
+ "fmla v15.4s, v31.4s, v6.s[3]\n"
+ "fmla v16.4s, v31.4s, v8.s[0]\n"
+ "fmla v17.4s, v31.4s, v8.s[1]\n"
+ "fmla v18.4s, v31.4s, v8.s[2]\n"
+ "fmla v19.4s, v31.4s, v8.s[3]\n"
+ "ldr q31, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v30.4s, v6.s[1]\n"
+ "fmla v13.4s, v30.4s, v6.s[2]\n"
+ "fmla v14.4s, v30.4s, v6.s[3]\n"
+ "fmla v15.4s, v30.4s, v7.s[0]\n"
+ "fmla v16.4s, v30.4s, v8.s[1]\n"
+ "fmla v17.4s, v30.4s, v8.s[2]\n"
+ "fmla v18.4s, v30.4s, v8.s[3]\n"
+ "fmla v19.4s, v30.4s, v9.s[0]\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "fmla v12.4s, v29.4s, v6.s[2]\n"
+ "fmla v13.4s, v29.4s, v6.s[3]\n"
+ "fmla v14.4s, v29.4s, v7.s[0]\n"
+ "fmla v15.4s, v29.4s, v7.s[1]\n"
+ "fmla v16.4s, v29.4s, v8.s[2]\n"
+ "fmla v17.4s, v29.4s, v8.s[3]\n"
+ "fmla v18.4s, v29.4s, v9.s[0]\n"
+ "fmla v19.4s, v29.4s, v9.s[1]\n"
+ "ldr q29, [%x[params], #0x110]\n"
+ "fmla v12.4s, v28.4s, v6.s[3]\n"
+ "fmla v13.4s, v28.4s, v7.s[0]\n"
+ "fmla v14.4s, v28.4s, v7.s[1]\n"
+ "fmla v15.4s, v28.4s, v7.s[2]\n"
+ "fmla v16.4s, v28.4s, v8.s[3]\n"
+ "fmla v17.4s, v28.4s, v9.s[0]\n"
+ "fmla v18.4s, v28.4s, v9.s[1]\n"
+ "fmla v19.4s, v28.4s, v9.s[2]\n"
+ "ldr q28, [%x[params], #0x120]\n"
+ "fmla v12.4s, v27.4s, v7.s[0]\n"
+ "fmla v13.4s, v27.4s, v7.s[1]\n"
+ "fmla v14.4s, v27.4s, v7.s[2]\n"
+ "fmla v15.4s, v27.4s, v7.s[3]\n"
+ "fmla v16.4s, v27.4s, v9.s[0]\n"
+ "fmla v17.4s, v27.4s, v9.s[1]\n"
+ "fmla v18.4s, v27.4s, v9.s[2]\n"
+ "fmla v19.4s, v27.4s, v9.s[3]\n"
+ "ldr q27, [%x[params], #0x130]\n"
+ "fmla v12.4s, v31.4s, v8.s[0]\n"
+ "fmla v13.4s, v31.4s, v8.s[1]\n"
+ "fmla v14.4s, v31.4s, v8.s[2]\n"
+ "fmla v15.4s, v31.4s, v8.s[3]\n"
+ "fmla v16.4s, v31.4s, v10.s[0]\n"
+ "fmla v17.4s, v31.4s, v10.s[1]\n"
+ "fmla v18.4s, v31.4s, v10.s[2]\n"
+ "fmla v19.4s, v31.4s, v10.s[3]\n"
+ "ldr q31, [%x[params], #0x150]\n"
+ "fmla v12.4s, v30.4s, v8.s[1]\n"
+ "fmla v13.4s, v30.4s, v8.s[2]\n"
+ "fmla v14.4s, v30.4s, v8.s[3]\n"
+ "fmla v15.4s, v30.4s, v9.s[0]\n"
+ "fmla v16.4s, v30.4s, v10.s[1]\n"
+ "fmla v17.4s, v30.4s, v10.s[2]\n"
+ "fmla v18.4s, v30.4s, v10.s[3]\n"
+ "fmla v19.4s, v30.4s, v11.s[0]\n"
+ "ldr q30, [%x[params], #0x160]\n"
+ "fmla v12.4s, v29.4s, v8.s[2]\n"
+ "fmla v13.4s, v29.4s, v8.s[3]\n"
+ "fmla v14.4s, v29.4s, v9.s[0]\n"
+ "fmla v15.4s, v29.4s, v9.s[1]\n"
+ "fmla v16.4s, v29.4s, v10.s[2]\n"
+ "fmla v17.4s, v29.4s, v10.s[3]\n"
+ "fmla v18.4s, v29.4s, v11.s[0]\n"
+ "fmla v19.4s, v29.4s, v11.s[1]\n"
+ "ldr q29, [%x[params], #0x170]\n"
+ "fmla v12.4s, v28.4s, v8.s[3]\n"
+ "fmla v13.4s, v28.4s, v9.s[0]\n"
+ "fmla v14.4s, v28.4s, v9.s[1]\n"
+ "fmla v15.4s, v28.4s, v9.s[2]\n"
+ "fmla v16.4s, v28.4s, v10.s[3]\n"
+ "fmla v17.4s, v28.4s, v11.s[0]\n"
+ "fmla v18.4s, v28.4s, v11.s[1]\n"
+ "fmla v19.4s, v28.4s, v11.s[2]\n"
+ "ldr q28, [%x[params], #0x180]\n"
+ "fmla v12.4s, v27.4s, v9.s[0]\n"
+ "fmla v13.4s, v27.4s, v9.s[1]\n"
+ "fmla v14.4s, v27.4s, v9.s[2]\n"
+ "fmla v15.4s, v27.4s, v9.s[3]\n"
+ "fmla v16.4s, v27.4s, v11.s[0]\n"
+ "fmla v17.4s, v27.4s, v11.s[1]\n"
+ "fmla v18.4s, v27.4s, v11.s[2]\n"
+ "fmla v19.4s, v27.4s, v11.s[3]\n"
+ "ldr q27, [%x[params], #0x190]\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "str q12, [x13, x25]\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "ldr q12, [%x[params], #0x140]\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "str q13, [x12, x25]\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "str q14, [x10, x25]\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "str q15, [x9, x25]\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "str q16, [x27, x25]\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "str q17, [x26, x25]\n"
+ "mov v13.16b, v12.16b\n"
+ "str q18, [x24, x25]\n"
+ "fmax v19.4s, v19.4s, v21.4s\n"
+ "mov v14.16b, v12.16b\n"
+ "str q19, [x23, x25]\n"
+ "mov v15.16b, v12.16b\n"
+ "add x25, x25, #0x10\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q30, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q29, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q28, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "fmla v12.4s, v31.4s, v2.s[0]\n"
+ "fmla v13.4s, v31.4s, v2.s[1]\n"
+ "fmla v14.4s, v31.4s, v2.s[2]\n"
+ "fmla v15.4s, v31.4s, v2.s[3]\n"
+ "fmla v16.4s, v31.4s, v4.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[1]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v4.s[3]\n"
+ "ldr q31, [%x[params], #0x50]\n"
+ "fmla v12.4s, v30.4s, v2.s[1]\n"
+ "fmla v13.4s, v30.4s, v2.s[2]\n"
+ "fmla v14.4s, v30.4s, v2.s[3]\n"
+ "fmla v15.4s, v30.4s, v3.s[0]\n"
+ "fmla v16.4s, v30.4s, v4.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[2]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[0]\n"
+ "ldr q30, [%x[params], #0x60]\n"
+ "fmla v12.4s, v29.4s, v2.s[2]\n"
+ "fmla v13.4s, v29.4s, v2.s[3]\n"
+ "fmla v14.4s, v29.4s, v3.s[0]\n"
+ "fmla v15.4s, v29.4s, v3.s[1]\n"
+ "fmla v16.4s, v29.4s, v4.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[3]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[1]\n"
+ "ldr q29, [%x[params], #0x70]\n"
+ "fmla v12.4s, v28.4s, v2.s[3]\n"
+ "fmla v13.4s, v28.4s, v3.s[0]\n"
+ "fmla v14.4s, v28.4s, v3.s[1]\n"
+ "fmla v15.4s, v28.4s, v3.s[2]\n"
+ "fmla v16.4s, v28.4s, v4.s[3]\n"
+ "fmla v17.4s, v28.4s, v5.s[0]\n"
+ "fmla v18.4s, v28.4s, v5.s[1]\n"
+ "fmla v19.4s, v28.4s, v5.s[2]\n"
+ "ldr q28, [%x[params], #0x80]\n"
+ "fmla v12.4s, v27.4s, v3.s[0]\n"
+ "fmla v13.4s, v27.4s, v3.s[1]\n"
+ "fmla v14.4s, v27.4s, v3.s[2]\n"
+ "fmla v15.4s, v27.4s, v3.s[3]\n"
+ "fmla v16.4s, v27.4s, v5.s[0]\n"
+ "fmla v17.4s, v27.4s, v5.s[1]\n"
+ "fmla v18.4s, v27.4s, v5.s[2]\n"
+ "fmla v19.4s, v27.4s, v5.s[3]\n"
+ "ldr q27, [%x[params], #0x90]\n"
+ "fmla v12.4s, v31.4s, v4.s[0]\n"
+ "fmla v13.4s, v31.4s, v4.s[1]\n"
+ "fmla v14.4s, v31.4s, v4.s[2]\n"
+ "fmla v15.4s, v31.4s, v4.s[3]\n"
+ "fmla v16.4s, v31.4s, v6.s[0]\n"
+ "fmla v17.4s, v31.4s, v6.s[1]\n"
+ "fmla v18.4s, v31.4s, v6.s[2]\n"
+ "fmla v19.4s, v31.4s, v6.s[3]\n"
+ "ldr q31, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v30.4s, v4.s[1]\n"
+ "fmla v13.4s, v30.4s, v4.s[2]\n"
+ "fmla v14.4s, v30.4s, v4.s[3]\n"
+ "fmla v15.4s, v30.4s, v5.s[0]\n"
+ "fmla v16.4s, v30.4s, v6.s[1]\n"
+ "fmla v17.4s, v30.4s, v6.s[2]\n"
+ "fmla v18.4s, v30.4s, v6.s[3]\n"
+ "fmla v19.4s, v30.4s, v7.s[0]\n"
+ "ldr q30, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v29.4s, v4.s[2]\n"
+ "fmla v13.4s, v29.4s, v4.s[3]\n"
+ "fmla v14.4s, v29.4s, v5.s[0]\n"
+ "fmla v15.4s, v29.4s, v5.s[1]\n"
+ "fmla v16.4s, v29.4s, v6.s[2]\n"
+ "fmla v17.4s, v29.4s, v6.s[3]\n"
+ "fmla v18.4s, v29.4s, v7.s[0]\n"
+ "fmla v19.4s, v29.4s, v7.s[1]\n"
+ "ldr q29, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v28.4s, v4.s[3]\n"
+ "fmla v13.4s, v28.4s, v5.s[0]\n"
+ "fmla v14.4s, v28.4s, v5.s[1]\n"
+ "fmla v15.4s, v28.4s, v5.s[2]\n"
+ "fmla v16.4s, v28.4s, v6.s[3]\n"
+ "fmla v17.4s, v28.4s, v7.s[0]\n"
+ "fmla v18.4s, v28.4s, v7.s[1]\n"
+ "fmla v19.4s, v28.4s, v7.s[2]\n"
+ "ldr q28, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v27.4s, v5.s[0]\n"
+ "fmla v13.4s, v27.4s, v5.s[1]\n"
+ "fmla v14.4s, v27.4s, v5.s[2]\n"
+ "fmla v15.4s, v27.4s, v5.s[3]\n"
+ "fmla v16.4s, v27.4s, v7.s[0]\n"
+ "fmla v17.4s, v27.4s, v7.s[1]\n"
+ "fmla v18.4s, v27.4s, v7.s[2]\n"
+ "fmla v19.4s, v27.4s, v7.s[3]\n"
+ "ldr q27, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v31.4s, v6.s[0]\n"
+ "fmla v13.4s, v31.4s, v6.s[1]\n"
+ "fmla v14.4s, v31.4s, v6.s[2]\n"
+ "fmla v15.4s, v31.4s, v6.s[3]\n"
+ "fmla v16.4s, v31.4s, v8.s[0]\n"
+ "fmla v17.4s, v31.4s, v8.s[1]\n"
+ "fmla v18.4s, v31.4s, v8.s[2]\n"
+ "fmla v19.4s, v31.4s, v8.s[3]\n"
+ "ldr q31, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v30.4s, v6.s[1]\n"
+ "fmla v13.4s, v30.4s, v6.s[2]\n"
+ "fmla v14.4s, v30.4s, v6.s[3]\n"
+ "fmla v15.4s, v30.4s, v7.s[0]\n"
+ "fmla v16.4s, v30.4s, v8.s[1]\n"
+ "fmla v17.4s, v30.4s, v8.s[2]\n"
+ "fmla v18.4s, v30.4s, v8.s[3]\n"
+ "fmla v19.4s, v30.4s, v9.s[0]\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "fmla v12.4s, v29.4s, v6.s[2]\n"
+ "fmla v13.4s, v29.4s, v6.s[3]\n"
+ "fmla v14.4s, v29.4s, v7.s[0]\n"
+ "fmla v15.4s, v29.4s, v7.s[1]\n"
+ "fmla v16.4s, v29.4s, v8.s[2]\n"
+ "fmla v17.4s, v29.4s, v8.s[3]\n"
+ "fmla v18.4s, v29.4s, v9.s[0]\n"
+ "fmla v19.4s, v29.4s, v9.s[1]\n"
+ "ldr q29, [%x[params], #0x110]\n"
+ "fmla v12.4s, v28.4s, v6.s[3]\n"
+ "fmla v13.4s, v28.4s, v7.s[0]\n"
+ "fmla v14.4s, v28.4s, v7.s[1]\n"
+ "fmla v15.4s, v28.4s, v7.s[2]\n"
+ "fmla v16.4s, v28.4s, v8.s[3]\n"
+ "fmla v17.4s, v28.4s, v9.s[0]\n"
+ "fmla v18.4s, v28.4s, v9.s[1]\n"
+ "fmla v19.4s, v28.4s, v9.s[2]\n"
+ "ldr q28, [%x[params], #0x120]\n"
+ "fmla v12.4s, v27.4s, v7.s[0]\n"
+ "fmla v13.4s, v27.4s, v7.s[1]\n"
+ "fmla v14.4s, v27.4s, v7.s[2]\n"
+ "fmla v15.4s, v27.4s, v7.s[3]\n"
+ "fmla v16.4s, v27.4s, v9.s[0]\n"
+ "fmla v17.4s, v27.4s, v9.s[1]\n"
+ "fmla v18.4s, v27.4s, v9.s[2]\n"
+ "fmla v19.4s, v27.4s, v9.s[3]\n"
+ "ldr q27, [%x[params], #0x130]\n"
+ "add %x[params], %x[params], #0x140\n"
+ "fmla v12.4s, v31.4s, v8.s[0]\n"
+ "fmla v13.4s, v31.4s, v8.s[1]\n"
+ "fmla v14.4s, v31.4s, v8.s[2]\n"
+ "fmla v15.4s, v31.4s, v8.s[3]\n"
+ "fmla v16.4s, v31.4s, v10.s[0]\n"
+ "fmla v17.4s, v31.4s, v10.s[1]\n"
+ "fmla v18.4s, v31.4s, v10.s[2]\n"
+ "fmla v19.4s, v31.4s, v10.s[3]\n"
+ "fmla v12.4s, v30.4s, v8.s[1]\n"
+ "fmla v13.4s, v30.4s, v8.s[2]\n"
+ "fmla v14.4s, v30.4s, v8.s[3]\n"
+ "fmla v15.4s, v30.4s, v9.s[0]\n"
+ "fmla v16.4s, v30.4s, v10.s[1]\n"
+ "fmla v17.4s, v30.4s, v10.s[2]\n"
+ "fmla v18.4s, v30.4s, v10.s[3]\n"
+ "fmla v19.4s, v30.4s, v11.s[0]\n"
+ "fmla v12.4s, v29.4s, v8.s[2]\n"
+ "fmla v13.4s, v29.4s, v8.s[3]\n"
+ "fmla v14.4s, v29.4s, v9.s[0]\n"
+ "fmla v15.4s, v29.4s, v9.s[1]\n"
+ "fmla v16.4s, v29.4s, v10.s[2]\n"
+ "fmla v17.4s, v29.4s, v10.s[3]\n"
+ "fmla v18.4s, v29.4s, v11.s[0]\n"
+ "fmla v19.4s, v29.4s, v11.s[1]\n"
+ "fmla v12.4s, v28.4s, v8.s[3]\n"
+ "fmla v13.4s, v28.4s, v9.s[0]\n"
+ "fmla v14.4s, v28.4s, v9.s[1]\n"
+ "fmla v15.4s, v28.4s, v9.s[2]\n"
+ "fmla v16.4s, v28.4s, v10.s[3]\n"
+ "fmla v17.4s, v28.4s, v11.s[0]\n"
+ "fmla v18.4s, v28.4s, v11.s[1]\n"
+ "fmla v19.4s, v28.4s, v11.s[2]\n"
+ "fmla v12.4s, v27.4s, v9.s[0]\n"
+ "fmla v13.4s, v27.4s, v9.s[1]\n"
+ "fmla v14.4s, v27.4s, v9.s[2]\n"
+ "fmla v15.4s, v27.4s, v9.s[3]\n"
+ "fmla v16.4s, v27.4s, v11.s[0]\n"
+ "fmla v17.4s, v27.4s, v11.s[1]\n"
+ "fmla v18.4s, v27.4s, v11.s[2]\n"
+ "fmla v19.4s, v27.4s, v11.s[3]\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "str q12, [x13, x25]\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "str q13, [x12, x25]\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "str q14, [x10, x25]\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "str q15, [x9, x25]\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "str q16, [x27, x25]\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "str q17, [x26, x25]\n"
+ "fmax v19.4s, v19.4s, v21.4s\n"
+ "str q18, [x24, x25]\n"
+ "str q19, [x23, x25]\n"
+ "add x25, x25, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "mov v13.16b, v12.16b\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "mov v14.16b, v12.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "mov v15.16b, v12.16b\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v16.16b, v12.16b\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "mov v17.16b, v12.16b\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q31, [%x[params], #0x60]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q30, [%x[params], #0x70]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q29, [%x[params], #0x80]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q28, [%x[params], #0x90]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q27, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v31.4s, v2.s[0]\n"
+ "fmla v13.4s, v31.4s, v2.s[1]\n"
+ "fmla v14.4s, v31.4s, v2.s[2]\n"
+ "fmla v15.4s, v31.4s, v2.s[3]\n"
+ "fmla v16.4s, v31.4s, v4.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[1]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v4.s[3]\n"
+ "ldr q31, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v30.4s, v2.s[1]\n"
+ "fmla v13.4s, v30.4s, v2.s[2]\n"
+ "fmla v14.4s, v30.4s, v2.s[3]\n"
+ "fmla v15.4s, v30.4s, v3.s[0]\n"
+ "fmla v16.4s, v30.4s, v4.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[2]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[0]\n"
+ "ldr q30, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v29.4s, v2.s[2]\n"
+ "fmla v13.4s, v29.4s, v2.s[3]\n"
+ "fmla v14.4s, v29.4s, v3.s[0]\n"
+ "fmla v15.4s, v29.4s, v3.s[1]\n"
+ "fmla v16.4s, v29.4s, v4.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[3]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[1]\n"
+ "ldr q29, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v28.4s, v2.s[3]\n"
+ "fmla v13.4s, v28.4s, v3.s[0]\n"
+ "fmla v14.4s, v28.4s, v3.s[1]\n"
+ "fmla v15.4s, v28.4s, v3.s[2]\n"
+ "fmla v16.4s, v28.4s, v4.s[3]\n"
+ "fmla v17.4s, v28.4s, v5.s[0]\n"
+ "fmla v18.4s, v28.4s, v5.s[1]\n"
+ "fmla v19.4s, v28.4s, v5.s[2]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v27.4s, v3.s[0]\n"
+ "fmla v13.4s, v27.4s, v3.s[1]\n"
+ "fmla v14.4s, v27.4s, v3.s[2]\n"
+ "fmla v15.4s, v27.4s, v3.s[3]\n"
+ "fmla v16.4s, v27.4s, v5.s[0]\n"
+ "fmla v17.4s, v27.4s, v5.s[1]\n"
+ "fmla v18.4s, v27.4s, v5.s[2]\n"
+ "fmla v19.4s, v27.4s, v5.s[3]\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v31.4s, v4.s[0]\n"
+ "fmla v13.4s, v31.4s, v4.s[1]\n"
+ "fmla v14.4s, v31.4s, v4.s[2]\n"
+ "fmla v15.4s, v31.4s, v4.s[3]\n"
+ "fmla v16.4s, v31.4s, v6.s[0]\n"
+ "fmla v17.4s, v31.4s, v6.s[1]\n"
+ "fmla v18.4s, v31.4s, v6.s[2]\n"
+ "fmla v19.4s, v31.4s, v6.s[3]\n"
+ "ldr q31, [%x[params], #0x100]\n"
+ "fmla v12.4s, v30.4s, v4.s[1]\n"
+ "fmla v13.4s, v30.4s, v4.s[2]\n"
+ "fmla v14.4s, v30.4s, v4.s[3]\n"
+ "fmla v15.4s, v30.4s, v5.s[0]\n"
+ "fmla v16.4s, v30.4s, v6.s[1]\n"
+ "fmla v17.4s, v30.4s, v6.s[2]\n"
+ "fmla v18.4s, v30.4s, v6.s[3]\n"
+ "fmla v19.4s, v30.4s, v7.s[0]\n"
+ "ldr q30, [%x[params], #0x110]\n"
+ "fmla v12.4s, v29.4s, v4.s[2]\n"
+ "fmla v13.4s, v29.4s, v4.s[3]\n"
+ "fmla v14.4s, v29.4s, v5.s[0]\n"
+ "fmla v15.4s, v29.4s, v5.s[1]\n"
+ "fmla v16.4s, v29.4s, v6.s[2]\n"
+ "fmla v17.4s, v29.4s, v6.s[3]\n"
+ "fmla v18.4s, v29.4s, v7.s[0]\n"
+ "fmla v19.4s, v29.4s, v7.s[1]\n"
+ "ldr q29, [%x[params], #0x120]\n"
+ "fmla v12.4s, v28.4s, v4.s[3]\n"
+ "fmla v13.4s, v28.4s, v5.s[0]\n"
+ "fmla v14.4s, v28.4s, v5.s[1]\n"
+ "fmla v15.4s, v28.4s, v5.s[2]\n"
+ "fmla v16.4s, v28.4s, v6.s[3]\n"
+ "fmla v17.4s, v28.4s, v7.s[0]\n"
+ "fmla v18.4s, v28.4s, v7.s[1]\n"
+ "fmla v19.4s, v28.4s, v7.s[2]\n"
+ "ldr q28, [%x[params], #0x130]\n"
+ "fmla v12.4s, v27.4s, v5.s[0]\n"
+ "fmla v13.4s, v27.4s, v5.s[1]\n"
+ "fmla v14.4s, v27.4s, v5.s[2]\n"
+ "fmla v15.4s, v27.4s, v5.s[3]\n"
+ "fmla v16.4s, v27.4s, v7.s[0]\n"
+ "fmla v17.4s, v27.4s, v7.s[1]\n"
+ "fmla v18.4s, v27.4s, v7.s[2]\n"
+ "fmla v19.4s, v27.4s, v7.s[3]\n"
+ "ldr q27, [%x[params], #0x140]\n"
+ "fmla v12.4s, v31.4s, v6.s[0]\n"
+ "fmla v13.4s, v31.4s, v6.s[1]\n"
+ "fmla v14.4s, v31.4s, v6.s[2]\n"
+ "fmla v15.4s, v31.4s, v6.s[3]\n"
+ "fmla v16.4s, v31.4s, v8.s[0]\n"
+ "fmla v17.4s, v31.4s, v8.s[1]\n"
+ "fmla v18.4s, v31.4s, v8.s[2]\n"
+ "fmla v19.4s, v31.4s, v8.s[3]\n"
+ "ldr q31, [%x[params], #0x150]\n"
+ "fmla v12.4s, v30.4s, v6.s[1]\n"
+ "fmla v13.4s, v30.4s, v6.s[2]\n"
+ "fmla v14.4s, v30.4s, v6.s[3]\n"
+ "fmla v15.4s, v30.4s, v7.s[0]\n"
+ "fmla v16.4s, v30.4s, v8.s[1]\n"
+ "fmla v17.4s, v30.4s, v8.s[2]\n"
+ "fmla v18.4s, v30.4s, v8.s[3]\n"
+ "fmla v19.4s, v30.4s, v9.s[0]\n"
+ "ldr q30, [%x[params], #0x160]\n"
+ "fmla v12.4s, v29.4s, v6.s[2]\n"
+ "fmla v13.4s, v29.4s, v6.s[3]\n"
+ "fmla v14.4s, v29.4s, v7.s[0]\n"
+ "fmla v15.4s, v29.4s, v7.s[1]\n"
+ "fmla v16.4s, v29.4s, v8.s[2]\n"
+ "fmla v17.4s, v29.4s, v8.s[3]\n"
+ "fmla v18.4s, v29.4s, v9.s[0]\n"
+ "fmla v19.4s, v29.4s, v9.s[1]\n"
+ "ldr q29, [%x[params], #0x170]\n"
+ "fmla v12.4s, v28.4s, v6.s[3]\n"
+ "fmla v13.4s, v28.4s, v7.s[0]\n"
+ "fmla v14.4s, v28.4s, v7.s[1]\n"
+ "fmla v15.4s, v28.4s, v7.s[2]\n"
+ "fmla v16.4s, v28.4s, v8.s[3]\n"
+ "fmla v17.4s, v28.4s, v9.s[0]\n"
+ "fmla v18.4s, v28.4s, v9.s[1]\n"
+ "fmla v19.4s, v28.4s, v9.s[2]\n"
+ "ldr q28, [%x[params], #0x180]\n"
+ "fmla v12.4s, v27.4s, v7.s[0]\n"
+ "fmla v13.4s, v27.4s, v7.s[1]\n"
+ "fmla v14.4s, v27.4s, v7.s[2]\n"
+ "fmla v15.4s, v27.4s, v7.s[3]\n"
+ "fmla v16.4s, v27.4s, v9.s[0]\n"
+ "fmla v17.4s, v27.4s, v9.s[1]\n"
+ "fmla v18.4s, v27.4s, v9.s[2]\n"
+ "fmla v19.4s, v27.4s, v9.s[3]\n"
+ "ldr q27, [%x[params], #0x190]\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmla v12.4s, v31.4s, v8.s[0]\n"
+ "fmla v13.4s, v31.4s, v8.s[1]\n"
+ "fmla v14.4s, v31.4s, v8.s[2]\n"
+ "fmla v15.4s, v31.4s, v8.s[3]\n"
+ "fmla v16.4s, v31.4s, v10.s[0]\n"
+ "fmla v17.4s, v31.4s, v10.s[1]\n"
+ "fmla v18.4s, v31.4s, v10.s[2]\n"
+ "fmla v19.4s, v31.4s, v10.s[3]\n"
+ "fmla v12.4s, v30.4s, v8.s[1]\n"
+ "fmla v13.4s, v30.4s, v8.s[2]\n"
+ "fmla v14.4s, v30.4s, v8.s[3]\n"
+ "fmla v15.4s, v30.4s, v9.s[0]\n"
+ "fmla v16.4s, v30.4s, v10.s[1]\n"
+ "fmla v17.4s, v30.4s, v10.s[2]\n"
+ "fmla v18.4s, v30.4s, v10.s[3]\n"
+ "fmla v19.4s, v30.4s, v11.s[0]\n"
+ "fmla v12.4s, v29.4s, v8.s[2]\n"
+ "fmla v13.4s, v29.4s, v8.s[3]\n"
+ "fmla v14.4s, v29.4s, v9.s[0]\n"
+ "fmla v15.4s, v29.4s, v9.s[1]\n"
+ "fmla v16.4s, v29.4s, v10.s[2]\n"
+ "fmla v17.4s, v29.4s, v10.s[3]\n"
+ "fmla v18.4s, v29.4s, v11.s[0]\n"
+ "fmla v19.4s, v29.4s, v11.s[1]\n"
+ "fmla v12.4s, v28.4s, v8.s[3]\n"
+ "fmla v13.4s, v28.4s, v9.s[0]\n"
+ "fmla v14.4s, v28.4s, v9.s[1]\n"
+ "fmla v15.4s, v28.4s, v9.s[2]\n"
+ "fmla v16.4s, v28.4s, v10.s[3]\n"
+ "fmla v17.4s, v28.4s, v11.s[0]\n"
+ "fmla v18.4s, v28.4s, v11.s[1]\n"
+ "fmla v19.4s, v28.4s, v11.s[2]\n"
+ "fmla v12.4s, v27.4s, v9.s[0]\n"
+ "fmla v13.4s, v27.4s, v9.s[1]\n"
+ "fmla v14.4s, v27.4s, v9.s[2]\n"
+ "fmla v15.4s, v27.4s, v9.s[3]\n"
+ "fmla v16.4s, v27.4s, v11.s[0]\n"
+ "fmla v17.4s, v27.4s, v11.s[1]\n"
+ "fmla v18.4s, v27.4s, v11.s[2]\n"
+ "fmla v19.4s, v27.4s, v11.s[3]\n"
+ "fmin v12.4s, v12.4s, v20.4s\n"
+ "fmin v13.4s, v13.4s, v20.4s\n"
+ "fmin v14.4s, v14.4s, v20.4s\n"
+ "fmax v12.4s, v12.4s, v21.4s\n"
+ "fmax v13.4s, v13.4s, v21.4s\n"
+ "fmax v14.4s, v14.4s, v21.4s\n"
+ "fmin v15.4s, v15.4s, v20.4s\n"
+ "fmin v16.4s, v16.4s, v20.4s\n"
+ "fmin v17.4s, v17.4s, v20.4s\n"
+ "fmax v15.4s, v15.4s, v21.4s\n"
+ "fmax v16.4s, v16.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmin v18.4s, v18.4s, v20.4s\n"
+ "fmin v19.4s, v19.4s, v20.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v19.4s, v19.4s, v21.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x19, x13, x25\n"
+ "st1 { v12.d }[0], [x19]\n"
+ "add x19, x12, x25\n"
+ "st1 { v13.d }[0], [x19]\n"
+ "add x19, x10, x25\n"
+ "st1 { v14.d }[0], [x19]\n"
+ "add x19, x9, x25\n"
+ "st1 { v15.d }[0], [x19]\n"
+ "add x19, x27, x25\n"
+ "st1 { v16.d }[0], [x19]\n"
+ "add x19, x26, x25\n"
+ "st1 { v17.d }[0], [x19]\n"
+ "add x19, x24, x25\n"
+ "st1 { v18.d }[0], [x19]\n"
+ "add x19, x23, x25\n"
+ "st1 { v19.d }[0], [x19]\n"
+ "add x25, x25, #0x8\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x19, x13, x25\n"
+ "st1 { v12.s }[2], [x19]\n"
+ "add x19, x12, x25\n"
+ "st1 { v13.s }[2], [x19]\n"
+ "add x19, x10, x25\n"
+ "st1 { v14.s }[2], [x19]\n"
+ "add x19, x9, x25\n"
+ "st1 { v15.s }[2], [x19]\n"
+ "add x19, x27, x25\n"
+ "st1 { v16.s }[2], [x19]\n"
+ "add x19, x26, x25\n"
+ "st1 { v17.s }[2], [x19]\n"
+ "add x19, x24, x25\n"
+ "st1 { v18.s }[2], [x19]\n"
+ "add x19, x23, x25\n"
+ "st1 { v19.s }[2], [x19]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x19, x13, x25\n"
+ "st1 { v12.s }[0], [x19]\n"
+ "add x19, x12, x25\n"
+ "st1 { v13.s }[0], [x19]\n"
+ "add x19, x10, x25\n"
+ "st1 { v14.s }[0], [x19]\n"
+ "add x19, x9, x25\n"
+ "st1 { v15.s }[0], [x19]\n"
+ "add x19, x27, x25\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "add x19, x26, x25\n"
+ "st1 { v17.s }[0], [x19]\n"
+ "add x19, x24, x25\n"
+ "st1 { v18.s }[0], [x19]\n"
+ "add x19, x23, x25\n"
+ "st1 { v19.s }[0], [x19]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+
+ "6:" // End
+
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
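Note: the implementation above is organised as three phases keyed on the channel count. Label 1 runs while at least one more full four-channel vector remains, label 2 handles the last full vector without the software-pipelined weight reloads, and label 3 processes the one-to-three leftover channels, gating the stores on the low bits of the count: `tbz ..., #1` skips a two-float store and `tbz ..., #0` skips the final single-float store. A scalar sketch of that tail-store pattern, with names of our own (the lane indices in the comments refer to the `st1` forms used above):

    #include <cstring>

    static void store_oddments(float *dst, const float *acc, unsigned int n_channels)
    {
      unsigned int pos = 0;
      if (n_channels & 2)          // bit 1 set: write two floats
      {
        std::memcpy(dst, acc, 2 * sizeof(float));   // st1 { v.d }[0]
        pos = 2;
      }
      if (n_channels & 1)          // bit 0 set: write one more float
      {
        dst[pos] = acc[pos];       // st1 { v.s }[2] (or [0] when bit 1 was unset)
      }
    }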
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2cc2f7c103
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 2; }
+
+ kern_type kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+ a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
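Note: unlike the fixed-shape strategies, this one takes `kernel_points` at run time. The implementation that follows halves it (`lsr x20, %x[kernel_points], #0x1`) to process two points per loop iteration and uses `tbnz %x[kernel_points], #0` to branch to an odd tail; each point contributes one weight vector (four output channels) and, via a pair of input pointers, the sixteen samples of the 2x8 output tile. A scalar reference for a single output channel, a sketch under our own names with the per-point pointer pair made explicit:

    #include <algorithm>

    static void reference_point_kernel(
      const float *const *inptrs,  // two pointers per kernel point, 8 samples each
      float *const *outptrs,       // 16 output pointers (the 2x8 tile)
      const float *weights,        // one weight per point for this channel
      float bias, unsigned int kernel_points,
      float act_min, float act_max)
    {
      for (unsigned int o = 0; o < 16; o++)
      {
        float acc = bias;          // the bias seeds every accumulator
        for (unsigned int p = 0; p < kernel_points; p++)
        {
          // The first pointer of each pair feeds outputs 0..7, the second 8..15.
          acc += weights[p] * inptrs[2 * p + o / 8][o % 8];
        }
        *outptrs[o] = std::min(std::max(acc, act_min), act_max);  // activation clamp
      }
    }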
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c93037d183
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v11.4s }, [%x[minmax_vals]]\n"
+ "mov x10, #0x0\n"
+ "add x19, %x[minmax_vals], #0x4\n"
+ "ld1r { v10.4s }, [x19]\n"
+ "lsr x9, %x[n_output_channels], #0x2\n"
+ "cbz x9, 8f\n"
+ "1:" // Output channel loop
+ "movi v16.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x10, #0x2\n"
+ "ldr q16, [%x[bias], x19]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v9.16b, v16.16b\n"
+ "ldr q8, [%x[weights], #0x0]\n"
+ "mov x19, %x[inptrs]\n"
+ "mov v7.16b, v16.16b\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "mov v6.16b, v16.16b\n"
+ "ldr q5, [x24, #0x0]\n"
+ "mov v4.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr q2, [x24, #0x10]\n"
+ "mov v1.16b, v16.16b\n"
+ "ldr q0, [x28, #0x0]\n"
+ "mov v31.16b, v16.16b\n"
+ "ldr q30, [x28, #0x10]\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "cbz x20, 6f\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "ldr q20, [%x[weights], #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q19, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q17, [x28, #0x0]\n"
+ "ldr q16, [x28, #0x10]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x10]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "ldr q30, [x28, #0x10]\n"
+ "fmla v9.4s, v20.4s, v19.s[0]\n"
+ "ldr q8, [%x[weights], #0x0]\n"
+ "fmla v7.4s, v20.4s, v19.s[1]\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "fmla v6.4s, v20.4s, v19.s[2]\n"
+ "fmla v4.4s, v20.4s, v19.s[3]\n"
+ "ldr q19, [x24, #0x0]\n"
+ "fmla v3.4s, v20.4s, v18.s[0]\n"
+ "fmla v1.4s, v20.4s, v18.s[1]\n"
+ "fmla v31.4s, v20.4s, v18.s[2]\n"
+ "fmla v29.4s, v20.4s, v18.s[3]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "fmla v28.4s, v20.4s, v17.s[0]\n"
+ "fmla v27.4s, v20.4s, v17.s[1]\n"
+ "fmla v26.4s, v20.4s, v17.s[2]\n"
+ "fmla v25.4s, v20.4s, v17.s[3]\n"
+ "ldr q17, [x28, #0x0]\n"
+ "fmla v24.4s, v20.4s, v16.s[0]\n"
+ "fmla v23.4s, v20.4s, v16.s[1]\n"
+ "fmla v22.4s, v20.4s, v16.s[2]\n"
+ "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "ldr q16, [x28, #0x10]\n"
+ "ldr q20, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "lsl x27, x10, #0x2\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "fmla v9.4s, v20.4s, v19.s[0]\n"
+ "fmla v7.4s, v20.4s, v19.s[1]\n"
+ "fmla v6.4s, v20.4s, v19.s[2]\n"
+ "fmla v4.4s, v20.4s, v19.s[3]\n"
+ "fmla v3.4s, v20.4s, v18.s[0]\n"
+ "fmla v1.4s, v20.4s, v18.s[1]\n"
+ "fmla v31.4s, v20.4s, v18.s[2]\n"
+ "fmla v29.4s, v20.4s, v18.s[3]\n"
+ "fmla v28.4s, v20.4s, v17.s[0]\n"
+ "fmla v27.4s, v20.4s, v17.s[1]\n"
+ "fmla v26.4s, v20.4s, v17.s[2]\n"
+ "fmla v25.4s, v20.4s, v17.s[3]\n"
+ "fmla v24.4s, v20.4s, v16.s[0]\n"
+ "fmla v23.4s, v20.4s, v16.s[1]\n"
+ "fmla v22.4s, v20.4s, v16.s[2]\n"
+ "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "fmin v9.4s, v9.4s, v10.4s\n"
+ "fmin v7.4s, v7.4s, v10.4s\n"
+ "fmin v6.4s, v6.4s, v10.4s\n"
+ "fmax v9.4s, v9.4s, v11.4s\n"
+ "str q9, [x19, x27]\n"
+ "fmax v7.4s, v7.4s, v11.4s\n"
+ "fmax v6.4s, v6.4s, v11.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmin v4.4s, v4.4s, v10.4s\n"
+ "str q7, [x20, x27]\n"
+ "fmin v3.4s, v3.4s, v10.4s\n"
+ "fmin v1.4s, v1.4s, v10.4s\n"
+ "str q6, [x21, x27]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "str q4, [x22, x27]\n"
+ "fmax v1.4s, v1.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "str q3, [x23, x27]\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "str q1, [x24, x27]\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "str q31, [x25, x27]\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "str q29, [x26, x27]\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "str q28, [x19, x27]\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "str q27, [x20, x27]\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "str q26, [x21, x27]\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "str q25, [x22, x27]\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "str q24, [x23, x27]\n"
+ "fmin v21.4s, v21.4s, v10.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "str q23, [x24, x27]\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v21.4s, v21.4s, v11.4s\n"
+ "str q22, [x25, x27]\n"
+ "str q21, [x26, x27]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "lsl x27, x10, #0x2\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x10]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "ldr q30, [x28, #0x10]\n"
+ "fmla v9.4s, v20.4s, v19.s[0]\n"
+ "ldr q8, [%x[weights], #0x0]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v7.4s, v20.4s, v19.s[1]\n"
+ "fmla v6.4s, v20.4s, v19.s[2]\n"
+ "fmla v4.4s, v20.4s, v19.s[3]\n"
+ "fmla v3.4s, v20.4s, v18.s[0]\n"
+ "fmla v1.4s, v20.4s, v18.s[1]\n"
+ "fmla v31.4s, v20.4s, v18.s[2]\n"
+ "fmla v29.4s, v20.4s, v18.s[3]\n"
+ "fmla v28.4s, v20.4s, v17.s[0]\n"
+ "fmla v27.4s, v20.4s, v17.s[1]\n"
+ "fmla v26.4s, v20.4s, v17.s[2]\n"
+ "fmla v25.4s, v20.4s, v17.s[3]\n"
+ "fmla v24.4s, v20.4s, v16.s[0]\n"
+ "fmla v23.4s, v20.4s, v16.s[1]\n"
+ "fmla v22.4s, v20.4s, v16.s[2]\n"
+ "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "fmin v9.4s, v9.4s, v10.4s\n"
+ "fmin v7.4s, v7.4s, v10.4s\n"
+ "fmin v6.4s, v6.4s, v10.4s\n"
+ "fmax v9.4s, v9.4s, v11.4s\n"
+ "str q9, [x19, x27]\n"
+ "fmax v7.4s, v7.4s, v11.4s\n"
+ "fmax v6.4s, v6.4s, v11.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmin v4.4s, v4.4s, v10.4s\n"
+ "str q7, [x20, x27]\n"
+ "fmin v3.4s, v3.4s, v10.4s\n"
+ "fmin v1.4s, v1.4s, v10.4s\n"
+ "str q6, [x21, x27]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "str q4, [x22, x27]\n"
+ "fmax v1.4s, v1.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "str q3, [x23, x27]\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "str q1, [x24, x27]\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "str q31, [x25, x27]\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "str q29, [x26, x27]\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "str q28, [x19, x27]\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "str q27, [x20, x27]\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "str q26, [x21, x27]\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "str q25, [x22, x27]\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "str q24, [x23, x27]\n"
+ "fmin v21.4s, v21.4s, v10.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "str q23, [x24, x27]\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v21.4s, v21.4s, v11.4s\n"
+ "str q22, [x25, x27]\n"
+ "str q21, [x26, x27]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "lsl x27, x10, #0x2\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "fmin v9.4s, v9.4s, v10.4s\n"
+ "fmin v7.4s, v7.4s, v10.4s\n"
+ "fmin v6.4s, v6.4s, v10.4s\n"
+ "fmax v9.4s, v9.4s, v11.4s\n"
+ "str q9, [x19, x27]\n"
+ "fmax v7.4s, v7.4s, v11.4s\n"
+ "fmax v6.4s, v6.4s, v11.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmin v4.4s, v4.4s, v10.4s\n"
+ "str q7, [x20, x27]\n"
+ "fmin v3.4s, v3.4s, v10.4s\n"
+ "fmin v1.4s, v1.4s, v10.4s\n"
+ "str q6, [x21, x27]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "str q4, [x22, x27]\n"
+ "fmax v1.4s, v1.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "str q3, [x23, x27]\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "str q1, [x24, x27]\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "str q31, [x25, x27]\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "str q29, [x26, x27]\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "str q28, [x19, x27]\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "str q27, [x20, x27]\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "str q26, [x21, x27]\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "str q25, [x22, x27]\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "str q24, [x23, x27]\n"
+ "fmin v21.4s, v21.4s, v10.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "str q23, [x24, x27]\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v21.4s, v21.4s, v11.4s\n"
+ "str q22, [x25, x27]\n"
+ "str q21, [x26, x27]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x4\n"
+ "cmp x10, x9, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 19f\n"
+ "8:" // Output channel oddments
+ "movi v16.16b, #0x0\n"
+ "cbz %x[bias], 11f\n"
+ "add x19, %x[bias], x10, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 10f\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Output channel oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 10f\n"
+ "ld1 { v16.s }[0], [x19]\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: End
+
+ "11:" // Output channel oddments: Load bias: Done
+ "mov v9.16b, v16.16b\n"
+ "ldr q8, [%x[weights], #0x0]\n"
+ "mov x19, %x[inptrs]\n"
+ "mov v7.16b, v16.16b\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "mov v6.16b, v16.16b\n"
+ "ldr q5, [x24, #0x0]\n"
+ "mov v4.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr q2, [x24, #0x10]\n"
+ "mov v1.16b, v16.16b\n"
+ "ldr q0, [x28, #0x0]\n"
+ "mov v31.16b, v16.16b\n"
+ "ldr q30, [x28, #0x10]\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "cbz x20, 15f\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "ldr q20, [%x[weights], #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q19, [x24, #0x0]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "ldr q17, [x28, #0x0]\n"
+ "ldr q16, [x28, #0x10]\n"
+ "beq 13f\n"
+ "12:" // Output channel oddments: Kernel loop
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x10]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "ldr q30, [x28, #0x10]\n"
+ "fmla v9.4s, v20.4s, v19.s[0]\n"
+ "ldr q8, [%x[weights], #0x0]\n"
+ "fmla v7.4s, v20.4s, v19.s[1]\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "fmla v6.4s, v20.4s, v19.s[2]\n"
+ "fmla v4.4s, v20.4s, v19.s[3]\n"
+ "ldr q19, [x24, #0x0]\n"
+ "fmla v3.4s, v20.4s, v18.s[0]\n"
+ "fmla v1.4s, v20.4s, v18.s[1]\n"
+ "fmla v31.4s, v20.4s, v18.s[2]\n"
+ "fmla v29.4s, v20.4s, v18.s[3]\n"
+ "ldr q18, [x24, #0x10]\n"
+ "fmla v28.4s, v20.4s, v17.s[0]\n"
+ "fmla v27.4s, v20.4s, v17.s[1]\n"
+ "fmla v26.4s, v20.4s, v17.s[2]\n"
+ "fmla v25.4s, v20.4s, v17.s[3]\n"
+ "ldr q17, [x28, #0x0]\n"
+ "fmla v24.4s, v20.4s, v16.s[0]\n"
+ "fmla v23.4s, v20.4s, v16.s[1]\n"
+ "fmla v22.4s, v20.4s, v16.s[2]\n"
+ "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "ldr q16, [x28, #0x10]\n"
+ "ldr q20, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 12b\n"
+ "13:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 14f\n"
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "fmla v9.4s, v20.4s, v19.s[0]\n"
+ "fmla v7.4s, v20.4s, v19.s[1]\n"
+ "fmla v6.4s, v20.4s, v19.s[2]\n"
+ "fmla v4.4s, v20.4s, v19.s[3]\n"
+ "fmla v3.4s, v20.4s, v18.s[0]\n"
+ "fmla v1.4s, v20.4s, v18.s[1]\n"
+ "fmla v31.4s, v20.4s, v18.s[2]\n"
+ "fmla v29.4s, v20.4s, v18.s[3]\n"
+ "fmla v28.4s, v20.4s, v17.s[0]\n"
+ "fmla v27.4s, v20.4s, v17.s[1]\n"
+ "fmla v26.4s, v20.4s, v17.s[2]\n"
+ "fmla v25.4s, v20.4s, v17.s[3]\n"
+ "fmla v24.4s, v20.4s, v16.s[0]\n"
+ "fmla v23.4s, v20.4s, v16.s[1]\n"
+ "fmla v22.4s, v20.4s, v16.s[2]\n"
+ "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "b 16f\n"
+ "14:" // Output channel oddments: Odd tail
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "ldp x24, x28, [x19], #0x10\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "ldr q2, [x24, #0x10]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "ldr q0, [x28, #0x0]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "ldr q30, [x28, #0x10]\n"
+ "fmla v9.4s, v20.4s, v19.s[0]\n"
+ "ldr q8, [%x[weights], #0x0]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v7.4s, v20.4s, v19.s[1]\n"
+ "fmla v6.4s, v20.4s, v19.s[2]\n"
+ "fmla v4.4s, v20.4s, v19.s[3]\n"
+ "fmla v3.4s, v20.4s, v18.s[0]\n"
+ "fmla v1.4s, v20.4s, v18.s[1]\n"
+ "fmla v31.4s, v20.4s, v18.s[2]\n"
+ "fmla v29.4s, v20.4s, v18.s[3]\n"
+ "fmla v28.4s, v20.4s, v17.s[0]\n"
+ "fmla v27.4s, v20.4s, v17.s[1]\n"
+ "fmla v26.4s, v20.4s, v17.s[2]\n"
+ "fmla v25.4s, v20.4s, v17.s[3]\n"
+ "fmla v24.4s, v20.4s, v16.s[0]\n"
+ "fmla v23.4s, v20.4s, v16.s[1]\n"
+ "fmla v22.4s, v20.4s, v16.s[2]\n"
+ "fmla v21.4s, v20.4s, v16.s[3]\n"
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "b 16f\n"
+ "15:" // Output channel oddments: Single kernel point
+ "fmla v9.4s, v8.4s, v5.s[0]\n"
+ "fmla v7.4s, v8.4s, v5.s[1]\n"
+ "fmla v6.4s, v8.4s, v5.s[2]\n"
+ "fmla v4.4s, v8.4s, v5.s[3]\n"
+ "fmla v3.4s, v8.4s, v2.s[0]\n"
+ "fmla v1.4s, v8.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v2.s[2]\n"
+ "fmla v29.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v0.s[0]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "fmla v26.4s, v8.4s, v0.s[2]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "fmla v24.4s, v8.4s, v30.s[0]\n"
+ "fmla v23.4s, v8.4s, v30.s[1]\n"
+ "fmla v22.4s, v8.4s, v30.s[2]\n"
+ "fmla v21.4s, v8.4s, v30.s[3]\n"
+ "16:" // Output channel oddments: Done
+ "fmin v9.4s, v9.4s, v10.4s\n"
+ "fmin v7.4s, v7.4s, v10.4s\n"
+ "fmin v6.4s, v6.4s, v10.4s\n"
+ "fmin v4.4s, v4.4s, v10.4s\n"
+ "fmax v9.4s, v9.4s, v11.4s\n"
+ "fmax v7.4s, v7.4s, v11.4s\n"
+ "fmax v6.4s, v6.4s, v11.4s\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "fmin v3.4s, v3.4s, v10.4s\n"
+ "fmin v1.4s, v1.4s, v10.4s\n"
+ "fmin v31.4s, v31.4s, v10.4s\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "fmax v1.4s, v1.4s, v11.4s\n"
+ "fmax v31.4s, v31.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v10.4s\n"
+ "fmin v28.4s, v28.4s, v10.4s\n"
+ "fmin v27.4s, v27.4s, v10.4s\n"
+ "fmax v29.4s, v29.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v11.4s\n"
+ "fmax v27.4s, v27.4s, v11.4s\n"
+ "fmin v26.4s, v26.4s, v10.4s\n"
+ "fmin v25.4s, v25.4s, v10.4s\n"
+ "fmin v24.4s, v24.4s, v10.4s\n"
+ "fmax v26.4s, v26.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v11.4s\n"
+ "fmax v24.4s, v24.4s, v11.4s\n"
+ "fmin v23.4s, v23.4s, v10.4s\n"
+ "fmin v22.4s, v22.4s, v10.4s\n"
+ "fmin v21.4s, v21.4s, v10.4s\n"
+ "fmax v23.4s, v23.4s, v11.4s\n"
+ "fmax v22.4s, v22.4s, v11.4s\n"
+ "fmax v21.4s, v21.4s, v11.4s\n"
+ "tbz %x[n_output_channels], #1, 17f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v9.d }[0], [x19]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v7.d }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v6.d }[0], [x21]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v4.d }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v3.d }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v1.d }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v31.d }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #2\n"
+ "st1 { v29.d }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v28.d }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v23.d }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v22.d }[0], [x25]\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v21.d }[0], [x26]\n"
+ "tbz %x[n_output_channels], #0, 18f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v9.s }[2], [x19]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v7.s }[2], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v6.s }[2], [x21]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v4.s }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v3.s }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v1.s }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v31.s }[2], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #2\n"
+ "st1 { v29.s }[2], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v23.s }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v21.s }[2], [x26]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 18f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v9.s }[0], [x19]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v7.s }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v6.s }[0], [x21]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v4.s }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v3.s }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v1.s }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v31.s }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x10, LSL #2\n"
+ "st1 { v29.s }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v23.s }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v22.s }[0], [x25]\n"
+ "st1 { v21.s }[0], [x26]\n"
+ "18:" // Output channel oddments: Done: Store: Bit 1: End
+
+ "19:" // Done
+
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
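
For reference, the fp32 oddment tail above reduces to two idioms: a lane-broadcast fused multiply-accumulate ("fmla vN.4s, vW.4s, vI.s[k]") that applies one vector of four output-channel weights to four output pixels at a time, and an fmin/fmax pair that fuses the activation clamp before the stores. A minimal C++/NEON intrinsics sketch of the same pattern follows; the names (acc, weights, inputs, act_min, act_max) are illustrative only and not part of the kernel's interface.

#include <arm_neon.h>

// One kernel point: broadcast each lane of `inputs` against a vector of four
// output-channel weights, mirroring "fmla v9.4s, v8.4s, v5.s[0]" and friends.
static inline void accumulate_point(float32x4_t acc[4],
                                    float32x4_t weights,
                                    float32x4_t inputs)
{
  acc[0] = vfmaq_laneq_f32(acc[0], weights, inputs, 0);
  acc[1] = vfmaq_laneq_f32(acc[1], weights, inputs, 1);
  acc[2] = vfmaq_laneq_f32(acc[2], weights, inputs, 2);
  acc[3] = vfmaq_laneq_f32(acc[3], weights, inputs, 3);
}

// Fused activation: clamp every accumulator to [act_min, act_max], as the
// fmin (against v10) / fmax (against v11) pair does before each store.
static inline float32x4_t clamp_activation(float32x4_t v,
                                           float32x4_t act_min,
                                           float32x4_t act_max)
{
  return vmaxq_f32(vminq_f32(v, act_max), act_min);
}
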
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..c76cb9906f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
+
+ kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+ a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
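
For reference, the geometry constants in this descriptor are linked: a dense depthwise kernel consumes an input tile of stride * (output - 1) + kernel elements per dimension, which is how a 3x3 kernel at stride 1 producing a 2x2 output tile arrives at input_rows == input_cols == 4. A minimal sketch of the relation (not part of the patch):

constexpr unsigned int input_extent(unsigned int out, unsigned int stride, unsigned int kern)
{
  // Input elements touched by `out` output points at the given stride.
  return stride * (out - 1) + kern;
}

static_assert(input_extent(2, 1, 3) == 4, "matches input_rows/input_cols above");
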
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ed8cd4861e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+ __asm__ __volatile__(
+ "ldp x13, x12, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "ldp x11, x10, [%x[inptrs], #0x10]\n"
+ "mov x19, #0x1\n"
+ "ldp x9, x28, [%x[inptrs], #0x20]\n"
+ "orr x19, x19, #0x100\n"
+ "ldp x27, x26, [%x[inptrs], #0x30]\n"
+ "orr x19, x19, #0x10000\n"
+ "dup v11.4s, w19\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "mov x23, #0x0\n"
+ "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "lsr x20, %x[n_channels], #0x4\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v9.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.4s }, [x19]\n"
+ "cbz x20, 2f\n"
+ "1:" // Loop
+ "movi v15.4s, #0x0\n"
+ "ldr q27, [x13, x23]\n"
+ "subs x20, x20, #0x1\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q1, [x12, x23]\n"
+ "ldp x13, x12, [%x[inptrs], #0x40]\n"
+ "ldr q25, [x11, x23]\n"
+ "zip1 v7.16b, v27.16b, v25.16b\n"
+ "ldr q23, [x10, x23]\n"
+ "zip2 v5.16b, v27.16b, v25.16b\n"
+ "ldp x11, x10, [%x[inptrs], #0x50]\n"
+ "ldr q31, [x9, x23]\n"
+ "zip1 v8.16b, v1.16b, v23.16b\n"
+ "ldr q28, [x28, x23]\n"
+ "zip2 v3.16b, v1.16b, v23.16b\n"
+ "ldp x9, x28, [%x[inptrs], #0x60]\n"
+ "zip1 v6.16b, v7.16b, v8.16b\n"
+ "ldr q21, [x27, x23]\n"
+ "zip2 v8.16b, v7.16b, v8.16b\n"
+ "ldr q26, [x26, x23]\n"
+ "zip1 v7.16b, v5.16b, v3.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x70]\n"
+ "zip2 v5.16b, v5.16b, v3.16b\n"
+ "ldr q24, [x13, x23]\n"
+ "ldr q22, [x12, x23]\n"
+ "zip1 v2.16b, v31.16b, v21.16b\n"
+ "zip2 v4.16b, v31.16b, v21.16b\n"
+ "ldp x13, x12, [%x[inptrs], #0x0]\n"
+ "zip1 v1.16b, v28.16b, v26.16b\n"
+ "ldr q20, [x11, x23]\n"
+ "zip2 v31.16b, v28.16b, v26.16b\n"
+ "ldr q16, [x10, x23]\n"
+ "zip1 v3.16b, v2.16b, v1.16b\n"
+ "ldp x11, x10, [%x[inptrs], #0x10]\n"
+ "zip2 v2.16b, v2.16b, v1.16b\n"
+ "ldr q19, [x9, x23]\n"
+ "zip1 v1.16b, v4.16b, v31.16b\n"
+ "ldr q0, [x28, x23]\n"
+ "zip1 v28.16b, v24.16b, v20.16b\n"
+ "ldp x9, x28, [%x[inptrs], #0x20]\n"
+ "zip2 v26.16b, v24.16b, v20.16b\n"
+ "ldr q18, [x27, x23]\n"
+ "zip1 v24.16b, v22.16b, v16.16b\n"
+ "ldr q17, [x26, x23]\n"
+ "zip2 v22.16b, v22.16b, v16.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x30]\n"
+ "zip2 v16.16b, v4.16b, v31.16b\n"
+ "str q7, [SP, #0x0]\n"
+ "zip1 v31.16b, v28.16b, v24.16b\n"
+ "str q5, [SP, #0x10]\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "str q1, [SP, #0x20]\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "str q16, [SP, #0x30]\n"
+ "zip1 v18.16b, v0.16b, v17.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "zip2 v28.16b, v28.16b, v24.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "zip1 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x40]\n"
+ "zip2 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x50]\n"
+ "zip1 v26.16b, v20.16b, v18.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip2 v24.16b, v20.16b, v18.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x60]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x70]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v20.16b, v30.16b\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ "ldr q29, [%x[params], #0x70]\n"
+ ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ "ldr q3, [SP, #0x20]\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ "ldr q27, [%x[params], #0x80]\n"
+ ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ "ldr q31, [SP, #0x40]\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "ldr q25, [%x[params], #0x90]\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
+ "ldr q6, [SP, #0x0]\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "ldr q26, [SP, #0x60]\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "movi v15.4s, #0x0\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xa0]\n"
+ ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "mov v17.16b, v15.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "ldr q30, [%x[params], #0x60]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0xb0]\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x22, x23]\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "movi v10.4s, #0x0\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0xd0]\n"
+ ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ "ldr q2, [SP, #0x30]\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ "ldr q27, [%x[params], #0xe0]\n"
+ ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ "ldr q28, [SP, #0x50]\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "ldr q25, [%x[params], #0xf0]\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
+ "ldr q8, [SP, #0x10]\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "ldr q24, [SP, #0x70]\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "movi v15.4s, #0x0\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
+ "movi v10.4s, #0x0\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "mov v17.16b, v15.16b\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0x110]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "ldr q30, [%x[params], #0xc0]\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "str s20, [x22, x23]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ "ldr q29, [%x[params], #0x130]\n"
+ ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ "ldr q27, [%x[params], #0x140]\n"
+ ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "ldr q25, [%x[params], #0x150]\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "movi v15.4s, #0x0\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "movi v10.4s, #0x0\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "mov v17.16b, v15.16b\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0x170]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x22, x23]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x22, x23]\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "add x23, x23, #0x4\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0xf\n"
+ "beq 34f\n"
+ "2:" // Oddments
+ "and x19, %x[n_channels], #0xf\n"
+ "add x13, x13, x23\n"
+ "add x12, x12, x23\n"
+ "add x11, x11, x23\n"
+ "add x10, x10, x23\n"
+ "add x9, x9, x23\n"
+ "add x28, x28, x23\n"
+ "add x27, x27, x23\n"
+ "add x26, x26, x23\n"
+ "tbz %x[n_channels], #3, 6f\n"
+ "ld1 { v27.d }[0], [x13], #0x8\n"
+ "ld1 { v1.d }[0], [x12], #0x8\n"
+ "ld1 { v25.d }[0], [x11], #0x8\n"
+ "ld1 { v23.d }[0], [x10], #0x8\n"
+ "ld1 { v31.d }[0], [x9], #0x8\n"
+ "ld1 { v28.d }[0], [x28], #0x8\n"
+ "ld1 { v21.d }[0], [x27], #0x8\n"
+ "ld1 { v26.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #2, 4f\n"
+ "ld1 { v27.s }[2], [x13], #0x4\n"
+ "ld1 { v1.s }[2], [x12], #0x4\n"
+ "ld1 { v25.s }[2], [x11], #0x4\n"
+ "ld1 { v23.s }[2], [x10], #0x4\n"
+ "ld1 { v31.s }[2], [x9], #0x4\n"
+ "ld1 { v28.s }[2], [x28], #0x4\n"
+ "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 3f\n"
+ "ld1 { v27.h }[6], [x13], #0x2\n"
+ "ld1 { v1.h }[6], [x12], #0x2\n"
+ "ld1 { v25.h }[6], [x11], #0x2\n"
+ "ld1 { v23.h }[6], [x10], #0x2\n"
+ "ld1 { v31.h }[6], [x9], #0x2\n"
+ "ld1 { v28.h }[6], [x28], #0x2\n"
+ "ld1 { v21.h }[6], [x27], #0x2\n"
+ "ld1 { v26.h }[6], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[14], [x13], #0x1\n"
+ "ld1 { v1.b }[14], [x12], #0x1\n"
+ "ld1 { v25.b }[14], [x11], #0x1\n"
+ "ld1 { v23.b }[14], [x10], #0x1\n"
+ "ld1 { v31.b }[14], [x9], #0x1\n"
+ "ld1 { v28.b }[14], [x28], #0x1\n"
+ "ld1 { v21.b }[14], [x27], #0x1\n"
+ "ld1 { v26.b }[14], [x26], #0x1\n"
+ "b 10f\n"
+ "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[12], [x13], #0x1\n"
+ "ld1 { v1.b }[12], [x12], #0x1\n"
+ "ld1 { v25.b }[12], [x11], #0x1\n"
+ "ld1 { v23.b }[12], [x10], #0x1\n"
+ "ld1 { v31.b }[12], [x9], #0x1\n"
+ "ld1 { v28.b }[12], [x28], #0x1\n"
+ "ld1 { v21.b }[12], [x27], #0x1\n"
+ "ld1 { v26.b }[12], [x26], #0x1\n"
+ "b 10f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v27.h }[4], [x13], #0x2\n"
+ "ld1 { v1.h }[4], [x12], #0x2\n"
+ "ld1 { v25.h }[4], [x11], #0x2\n"
+ "ld1 { v23.h }[4], [x10], #0x2\n"
+ "ld1 { v31.h }[4], [x9], #0x2\n"
+ "ld1 { v28.h }[4], [x28], #0x2\n"
+ "ld1 { v21.h }[4], [x27], #0x2\n"
+ "ld1 { v26.h }[4], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[10], [x13], #0x1\n"
+ "ld1 { v1.b }[10], [x12], #0x1\n"
+ "ld1 { v25.b }[10], [x11], #0x1\n"
+ "ld1 { v23.b }[10], [x10], #0x1\n"
+ "ld1 { v31.b }[10], [x9], #0x1\n"
+ "ld1 { v28.b }[10], [x28], #0x1\n"
+ "ld1 { v21.b }[10], [x27], #0x1\n"
+ "ld1 { v26.b }[10], [x26], #0x1\n"
+ "b 10f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[8], [x13], #0x1\n"
+ "ld1 { v1.b }[8], [x12], #0x1\n"
+ "ld1 { v25.b }[8], [x11], #0x1\n"
+ "ld1 { v23.b }[8], [x10], #0x1\n"
+ "ld1 { v31.b }[8], [x9], #0x1\n"
+ "ld1 { v28.b }[8], [x28], #0x1\n"
+ "ld1 { v21.b }[8], [x27], #0x1\n"
+ "ld1 { v26.b }[8], [x26], #0x1\n"
+ "b 10f\n"
+ "6:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 8f\n"
+ "ld1 { v27.s }[0], [x13], #0x4\n"
+ "ld1 { v1.s }[0], [x12], #0x4\n"
+ "ld1 { v25.s }[0], [x11], #0x4\n"
+ "ld1 { v23.s }[0], [x10], #0x4\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v28.s }[0], [x28], #0x4\n"
+ "ld1 { v21.s }[0], [x27], #0x4\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v27.h }[2], [x13], #0x2\n"
+ "ld1 { v1.h }[2], [x12], #0x2\n"
+ "ld1 { v25.h }[2], [x11], #0x2\n"
+ "ld1 { v23.h }[2], [x10], #0x2\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "ld1 { v21.h }[2], [x27], #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[6], [x13], #0x1\n"
+ "ld1 { v1.b }[6], [x12], #0x1\n"
+ "ld1 { v25.b }[6], [x11], #0x1\n"
+ "ld1 { v23.b }[6], [x10], #0x1\n"
+ "ld1 { v31.b }[6], [x9], #0x1\n"
+ "ld1 { v28.b }[6], [x28], #0x1\n"
+ "ld1 { v21.b }[6], [x27], #0x1\n"
+ "ld1 { v26.b }[6], [x26], #0x1\n"
+ "b 10f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[4], [x13], #0x1\n"
+ "ld1 { v1.b }[4], [x12], #0x1\n"
+ "ld1 { v25.b }[4], [x11], #0x1\n"
+ "ld1 { v23.b }[4], [x10], #0x1\n"
+ "ld1 { v31.b }[4], [x9], #0x1\n"
+ "ld1 { v28.b }[4], [x28], #0x1\n"
+ "ld1 { v21.b }[4], [x27], #0x1\n"
+ "ld1 { v26.b }[4], [x26], #0x1\n"
+ "b 10f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v27.h }[0], [x13], #0x2\n"
+ "ld1 { v1.h }[0], [x12], #0x2\n"
+ "ld1 { v25.h }[0], [x11], #0x2\n"
+ "ld1 { v23.h }[0], [x10], #0x2\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v28.h }[0], [x28], #0x2\n"
+ "ld1 { v21.h }[0], [x27], #0x2\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[2], [x13], #0x1\n"
+ "ld1 { v1.b }[2], [x12], #0x1\n"
+ "ld1 { v25.b }[2], [x11], #0x1\n"
+ "ld1 { v23.b }[2], [x10], #0x1\n"
+ "ld1 { v31.b }[2], [x9], #0x1\n"
+ "ld1 { v28.b }[2], [x28], #0x1\n"
+ "ld1 { v21.b }[2], [x27], #0x1\n"
+ "ld1 { v26.b }[2], [x26], #0x1\n"
+ "b 10f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[0], [x13], #0x1\n"
+ "ld1 { v1.b }[0], [x12], #0x1\n"
+ "ld1 { v25.b }[0], [x11], #0x1\n"
+ "ld1 { v23.b }[0], [x10], #0x1\n"
+ "ld1 { v31.b }[0], [x9], #0x1\n"
+ "ld1 { v28.b }[0], [x28], #0x1\n"
+ "ld1 { v21.b }[0], [x27], #0x1\n"
+ "ld1 { v26.b }[0], [x26], #0x1\n"
+ "10:" // Oddments: Load (A): Bit 3: End
+ "ldp x13, x12, [%x[inptrs], #0x40]\n"
+ "add x13, x13, x23\n"
+ "ldp x11, x10, [%x[inptrs], #0x50]\n"
+ "ldp x9, x28, [%x[inptrs], #0x60]\n"
+ "add x12, x12, x23\n"
+ "ldp x27, x26, [%x[inptrs], #0x70]\n"
+ "add x11, x11, x23\n"
+ "add x10, x10, x23\n"
+ "add x9, x9, x23\n"
+ "add x28, x28, x23\n"
+ "add x27, x27, x23\n"
+ "add x26, x26, x23\n"
+ "tbz %x[n_channels], #3, 14f\n"
+ "ld1 { v24.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "ld1 { v20.d }[0], [x11], #0x8\n"
+ "ld1 { v16.d }[0], [x10], #0x8\n"
+ "ld1 { v19.d }[0], [x9], #0x8\n"
+ "ld1 { v0.d }[0], [x28], #0x8\n"
+ "ld1 { v18.d }[0], [x27], #0x8\n"
+ "ld1 { v17.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ld1 { v24.s }[2], [x13], #0x4\n"
+ "ld1 { v22.s }[2], [x12], #0x4\n"
+ "ld1 { v20.s }[2], [x11], #0x4\n"
+ "ld1 { v16.s }[2], [x10], #0x4\n"
+ "ld1 { v19.s }[2], [x9], #0x4\n"
+ "ld1 { v0.s }[2], [x28], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v24.h }[6], [x13], #0x2\n"
+ "ld1 { v22.h }[6], [x12], #0x2\n"
+ "ld1 { v20.h }[6], [x11], #0x2\n"
+ "ld1 { v16.h }[6], [x10], #0x2\n"
+ "ld1 { v19.h }[6], [x9], #0x2\n"
+ "ld1 { v0.h }[6], [x28], #0x2\n"
+ "ld1 { v18.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[14], [x13], #0x1\n"
+ "ld1 { v22.b }[14], [x12], #0x1\n"
+ "ld1 { v20.b }[14], [x11], #0x1\n"
+ "ld1 { v16.b }[14], [x10], #0x1\n"
+ "ld1 { v19.b }[14], [x9], #0x1\n"
+ "ld1 { v0.b }[14], [x28], #0x1\n"
+ "ld1 { v18.b }[14], [x27], #0x1\n"
+ "ld1 { v17.b }[14], [x26], #0x1\n"
+ "b 18f\n"
+ "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[12], [x13], #0x1\n"
+ "ld1 { v22.b }[12], [x12], #0x1\n"
+ "ld1 { v20.b }[12], [x11], #0x1\n"
+ "ld1 { v16.b }[12], [x10], #0x1\n"
+ "ld1 { v19.b }[12], [x9], #0x1\n"
+ "ld1 { v0.b }[12], [x28], #0x1\n"
+ "ld1 { v18.b }[12], [x27], #0x1\n"
+ "ld1 { v17.b }[12], [x26], #0x1\n"
+ "b 18f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v24.h }[4], [x13], #0x2\n"
+ "ld1 { v22.h }[4], [x12], #0x2\n"
+ "ld1 { v20.h }[4], [x11], #0x2\n"
+ "ld1 { v16.h }[4], [x10], #0x2\n"
+ "ld1 { v19.h }[4], [x9], #0x2\n"
+ "ld1 { v0.h }[4], [x28], #0x2\n"
+ "ld1 { v18.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[10], [x13], #0x1\n"
+ "ld1 { v22.b }[10], [x12], #0x1\n"
+ "ld1 { v20.b }[10], [x11], #0x1\n"
+ "ld1 { v16.b }[10], [x10], #0x1\n"
+ "ld1 { v19.b }[10], [x9], #0x1\n"
+ "ld1 { v0.b }[10], [x28], #0x1\n"
+ "ld1 { v18.b }[10], [x27], #0x1\n"
+ "ld1 { v17.b }[10], [x26], #0x1\n"
+ "b 18f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[8], [x13], #0x1\n"
+ "ld1 { v22.b }[8], [x12], #0x1\n"
+ "ld1 { v20.b }[8], [x11], #0x1\n"
+ "ld1 { v16.b }[8], [x10], #0x1\n"
+ "ld1 { v19.b }[8], [x9], #0x1\n"
+ "ld1 { v0.b }[8], [x28], #0x1\n"
+ "ld1 { v18.b }[8], [x27], #0x1\n"
+ "ld1 { v17.b }[8], [x26], #0x1\n"
+ "b 18f\n"
+ "14:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 16f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "ld1 { v22.s }[0], [x12], #0x4\n"
+ "ld1 { v20.s }[0], [x11], #0x4\n"
+ "ld1 { v16.s }[0], [x10], #0x4\n"
+ "ld1 { v19.s }[0], [x9], #0x4\n"
+ "ld1 { v0.s }[0], [x28], #0x4\n"
+ "ld1 { v18.s }[0], [x27], #0x4\n"
+ "ld1 { v17.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "ld1 { v22.h }[2], [x12], #0x2\n"
+ "ld1 { v20.h }[2], [x11], #0x2\n"
+ "ld1 { v16.h }[2], [x10], #0x2\n"
+ "ld1 { v19.h }[2], [x9], #0x2\n"
+ "ld1 { v0.h }[2], [x28], #0x2\n"
+ "ld1 { v18.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[6], [x13], #0x1\n"
+ "ld1 { v22.b }[6], [x12], #0x1\n"
+ "ld1 { v20.b }[6], [x11], #0x1\n"
+ "ld1 { v16.b }[6], [x10], #0x1\n"
+ "ld1 { v19.b }[6], [x9], #0x1\n"
+ "ld1 { v0.b }[6], [x28], #0x1\n"
+ "ld1 { v18.b }[6], [x27], #0x1\n"
+ "ld1 { v17.b }[6], [x26], #0x1\n"
+ "b 18f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[4], [x13], #0x1\n"
+ "ld1 { v22.b }[4], [x12], #0x1\n"
+ "ld1 { v20.b }[4], [x11], #0x1\n"
+ "ld1 { v16.b }[4], [x10], #0x1\n"
+ "ld1 { v19.b }[4], [x9], #0x1\n"
+ "ld1 { v0.b }[4], [x28], #0x1\n"
+ "ld1 { v18.b }[4], [x27], #0x1\n"
+ "ld1 { v17.b }[4], [x26], #0x1\n"
+ "b 18f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "ld1 { v22.h }[0], [x12], #0x2\n"
+ "ld1 { v20.h }[0], [x11], #0x2\n"
+ "ld1 { v16.h }[0], [x10], #0x2\n"
+ "ld1 { v19.h }[0], [x9], #0x2\n"
+ "ld1 { v0.h }[0], [x28], #0x2\n"
+ "ld1 { v18.h }[0], [x27], #0x2\n"
+ "ld1 { v17.h }[0], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[2], [x13], #0x1\n"
+ "ld1 { v22.b }[2], [x12], #0x1\n"
+ "ld1 { v20.b }[2], [x11], #0x1\n"
+ "ld1 { v16.b }[2], [x10], #0x1\n"
+ "ld1 { v19.b }[2], [x9], #0x1\n"
+ "ld1 { v0.b }[2], [x28], #0x1\n"
+ "ld1 { v18.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "b 18f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[0], [x13], #0x1\n"
+ "ld1 { v22.b }[0], [x12], #0x1\n"
+ "ld1 { v20.b }[0], [x11], #0x1\n"
+ "ld1 { v16.b }[0], [x10], #0x1\n"
+ "ld1 { v19.b }[0], [x9], #0x1\n"
+ "ld1 { v0.b }[0], [x28], #0x1\n"
+ "ld1 { v18.b }[0], [x27], #0x1\n"
+ "ld1 { v17.b }[0], [x26], #0x1\n"
+ "18:" // Oddments: Load (B): Bit 3: End
+ "zip1 v7.16b, v27.16b, v25.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "cmp x19, #0x4\n"
+ "zip2 v5.16b, v27.16b, v25.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "zip1 v8.16b, v1.16b, v23.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "zip2 v3.16b, v1.16b, v23.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip1 v2.16b, v31.16b, v21.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "zip2 v4.16b, v31.16b, v21.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "zip1 v1.16b, v28.16b, v26.16b\n"
+ "zip2 v31.16b, v28.16b, v26.16b\n"
+ "zip1 v28.16b, v24.16b, v20.16b\n"
+ "zip2 v26.16b, v24.16b, v20.16b\n"
+ "zip1 v24.16b, v22.16b, v16.16b\n"
+ "zip2 v22.16b, v22.16b, v16.16b\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "zip1 v18.16b, v0.16b, v17.16b\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "zip1 v6.16b, v7.16b, v8.16b\n"
+ "zip2 v8.16b, v7.16b, v8.16b\n"
+ "zip1 v7.16b, v5.16b, v3.16b\n"
+ "str q7, [SP, #0x0]\n"
+ "zip2 v5.16b, v5.16b, v3.16b\n"
+ "str q5, [SP, #0x10]\n"
+ "zip1 v3.16b, v2.16b, v1.16b\n"
+ "zip2 v2.16b, v2.16b, v1.16b\n"
+ "zip1 v1.16b, v4.16b, v31.16b\n"
+ "str q1, [SP, #0x20]\n"
+ "zip2 v16.16b, v4.16b, v31.16b\n"
+ "str q16, [SP, #0x30]\n"
+ "zip1 v31.16b, v28.16b, v24.16b\n"
+ "zip2 v28.16b, v28.16b, v24.16b\n"
+ "zip1 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x40]\n"
+ "zip2 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x50]\n"
+ "zip1 v26.16b, v20.16b, v18.16b\n"
+ "zip2 v24.16b, v20.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x60]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x70]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ "movi v15.4s, #0x0\n"
+ ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ "movi v10.4s, #0x0\n"
+ ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 19f\n"
+ "str s30, [x25, x23]\n"
+ "str s22, [x24, x23]\n"
+ "str s20, [x22, x23]\n"
+ "str s19, [x21, x23]\n"
+ "b 22f\n"
+ "19:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 20f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 21f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 21f\n"
+ "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 21f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+
+ "22:" // Oddments: Unroll 0: After oddment store
+ "add x23, x23, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "cmp x19, #0x4\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 23f\n"
+ "str s30, [x25, x23]\n"
+ "str s22, [x24, x23]\n"
+ "str s20, [x22, x23]\n"
+ "str s19, [x21, x23]\n"
+ "b 26f\n"
+ "23:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 24f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 25f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 25f\n"
+ "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 25f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+
+ "26:" // Oddments: Unroll 1: After oddment store
+ "add x23, x23, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q6, [SP, #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q3, [SP, #0x20]\n"
+ "cmp x19, #0x4\n"
+ ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n"
+ "ldr q31, [SP, #0x40]\n"
+ "ldr q26, [SP, #0x60]\n"
+ ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 27f\n"
+ "str s30, [x25, x23]\n"
+ "str s22, [x24, x23]\n"
+ "str s20, [x22, x23]\n"
+ "str s19, [x21, x23]\n"
+ "b 30f\n"
+ "27:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 28f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 29f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 29f\n"
+ "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 29f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+
+ "30:" // Oddments: Unroll 2: After oddment store
+ "add x23, x23, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q8, [SP, #0x10]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q2, [SP, #0x30]\n"
+ "ldr q28, [SP, #0x50]\n"
+ ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n"
+ "ldr q24, [SP, #0x70]\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "31:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 32f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 33f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 33f\n"
+ "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 33f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+
+ "34:" // End
+ "add SP, SP, #0x80\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
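Editor's note: the epilogue in the hunk above (SQRDMULH, the AND/SSHR/SQADD sign fixup, SRSHL, the ADD of the c_offset splat, and the SMIN/SMAX clamp before the UZP1 narrowing stores) is the standard fixed-point requantization used throughout these kernels; in this hunk v23 and v21 appear to carry the multiplier and shift, v13 the c_offset, and v9/v12 the clamp bounds. A minimal scalar sketch of the same math, with hypothetical helper names and not part of the patch:

    #include <algorithm>
    #include <cstdint>

    // SQRDMULH per lane: saturating rounding doubling high multiply,
    // i.e. (2*a*b + 2^30) >> 31 with the single overflowing case saturated.
    static int32_t sat_rdh_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
        return (int32_t)(((int64_t)a * (int64_t)b + (1LL << 30)) >> 31);
    }

    // AND/SSHR/SQADD + SRSHL: rounding right shift by 'exponent', with the
    // fixup nudging negative values so halves round away from zero.
    static int32_t rounding_divide_by_pot(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t)((1LL << exponent) - 1);
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    static int8_t requantize(int32_t acc, int32_t mul, int exponent,
                             int32_t c_offset, int32_t minval, int32_t maxval)
    {
        acc = sat_rdh_mul(acc, mul);                   // SQRDMULH
        acc = rounding_divide_by_pot(acc, exponent);   // fixup + SRSHL
        acc += c_offset;                               // ADD of the c_offset splat
        acc = std::min(maxval, std::max(minval, acc)); // SMIN/SMAX clamp
        return (int8_t)acc;                            // UZP1 narrowing store
    }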
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..76c927abcb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+ a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
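Editor's note: in the descriptor above, the input tile follows directly from the output tile, the kernel size and the stride: a 2x2 output of a 3x3 stride-1 kernel needs a 4x4 input patch, which is where input_rows/input_cols = 4 come from. A minimal sketch of the relation (hypothetical helper, not part of the patch):

    // Input extent needed to produce 'output_extent' points with a kernel
    // of 'kernel_extent' taps advancing by 'stride' per output point.
    constexpr unsigned int input_extent(unsigned int output_extent,
                                        unsigned int kernel_extent,
                                        unsigned int stride)
    {
        return (output_extent - 1) * stride + kernel_extent;
    }
    static_assert(input_extent(2, 3, 1) == 4, "matches the descriptor above");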
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3001276fb5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1192 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x17, #0x0\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x15, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x12, x8, #0x3\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v14.16b }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v9.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "cbz x12, 3f\n"
+ "subs x12, x12, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q13, [x19, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q19, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v13.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v9.8b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v9.8b\n"
+ "mov v20.16b, v19.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "ldr d4, [x16, #0x20]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ldr d7, [x16, #0x38]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "ssubl v5.8h, v5.8b, v9.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "ssubl v6.8h, v6.8b, v9.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "ssubl v7.8h, v7.8b, v9.8b\n"
+ "ssubl v8.8h, v8.8b, v9.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ldr d31, [x23, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "ldr d30, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "ldr d28, [x20, x17]\n"
+ "ldr d27, [x19, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v13.4s, v31.4h, v4.4h\n"
+ "ldr x21, [x14, #0x28]\n"
+ "add x16, x16, #0x48\n"
+ "smlal2 v19.4s, v31.8h, v4.8h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "subs x12, x12, #0x1\n"
+ "smlal v17.4s, v31.4h, v3.4h\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal2 v25.4s, v31.8h, v3.8h\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x19, x17]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal2 v19.4s, v28.8h, v5.8h\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "ldr q10, [x11, #0x0]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "ldr q11, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x17]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v21.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v19.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v27.4h, v6.4h\n"
+ "smlal2 v25.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x17]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "smlal v17.4s, v30.4h, v7.4h\n"
+ "smlal2 v25.4s, v30.8h, v7.8h\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v21.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x17]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
+ "smlal2 v19.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v25.4s, v28.8h, v5.8h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x19, x17]\n"
+ "add x17, x17, #0x8\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v19.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v8.4h\n"
+ "smlal2 v25.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v21.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+ "and v22.16b, v13.16b, v10.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v18.16b\n"
+ "and v3.16b, v17.16b, v10.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v6.16b, v25.16b, v18.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "and v0.16b, v16.16b, v10.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v29.16b, v21.16b, v18.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "smin v17.4s, v17.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "uzp1 v13.16b, v13.16b, v19.16b\n"
+ "sqadd v16.4s, v16.4s, v0.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x10, x15]\n"
+ "smax v25.4s, v25.4s, v24.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v3.16b, v23.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v25.16b\n"
+ "add v16.4s, v16.4s, v15.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x9, x15]\n"
+ "smin v16.4s, v16.4s, v12.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "and v25.16b, v20.16b, v18.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x28, x15]\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "srshl v20.4s, v20.4s, v18.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d23, [x27, x15]\n"
+ "add x15, x15, #0x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q13, [x19, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q19, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v13.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v9.8b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v9.8b\n"
+ "mov v20.16b, v19.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "ldr d4, [x16, #0x20]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ldr d7, [x16, #0x38]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "ssubl v5.8h, v5.8b, v9.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "ssubl v6.8h, v6.8b, v9.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "ssubl v7.8h, v7.8b, v9.8b\n"
+ "ssubl v8.8h, v8.8b, v9.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ldr d31, [x23, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "ldr d30, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "ldr d28, [x20, x17]\n"
+ "ldr d27, [x19, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v13.4s, v31.4h, v4.4h\n"
+ "ldr x21, [x14, #0x28]\n"
+ "tst x8, #0x7\n"
+ "smlal2 v19.4s, v31.8h, v4.8h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "smlal v17.4s, v31.4h, v3.4h\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal2 v25.4s, v31.8h, v3.8h\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x19, x17]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal2 v19.4s, v28.8h, v5.8h\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "ldr q10, [x11, #0x0]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "ldr q11, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x17]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v21.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v19.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v27.4h, v6.4h\n"
+ "smlal2 v25.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x17]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "smlal v17.4s, v30.4h, v7.4h\n"
+ "smlal2 v25.4s, v30.8h, v7.8h\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v21.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x17]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
+ "smlal2 v19.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v25.4s, v28.8h, v5.8h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x19, x17]\n"
+ "add x17, x17, #0x8\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v19.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v8.4h\n"
+ "smlal2 v25.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v21.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+ "and v22.16b, v13.16b, v10.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v18.16b\n"
+ "and v3.16b, v17.16b, v10.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v6.16b, v25.16b, v18.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "and v0.16b, v16.16b, v10.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v29.16b, v21.16b, v18.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "smin v17.4s, v17.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "uzp1 v13.16b, v13.16b, v19.16b\n"
+ "sqadd v16.4s, v16.4s, v0.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x10, x15]\n"
+ "smax v25.4s, v25.4s, v24.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v3.16b, v23.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v25.16b\n"
+ "add v16.4s, v16.4s, v15.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x9, x15]\n"
+ "smin v16.4s, v16.4s, v12.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "and v25.16b, v20.16b, v18.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x28, x15]\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "srshl v20.4s, v20.4s, v18.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d23, [x27, x15]\n"
+ "add x15, x15, #0x8\n"
+ "beq 64f\n"
+ "add x16, x16, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v13.4s }, [x19], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v19.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v19.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v19.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x8, #1, 6f\n"
+ "ld1 { v13.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v13.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v17.16b, v13.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v16.16b, v13.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr d4, [x16, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v9.8b\n"
+ "mov v20.16b, v19.16b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "ssubl v1.8h, v1.8b, v9.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d7, [x16, #0x38]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v9.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "ssubl v6.8h, v6.8b, v9.8b\n"
+ "ssubl v7.8h, v7.8b, v9.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ssubl v8.8h, v8.8b, v9.8b\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "add x19, x19, x17\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x8, #1, 10f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ldr x21, [x14, #0x28]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v4.4h\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal2 v19.4s, v31.8h, v4.8h\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v31.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v25.4s, v31.8h, v3.8h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "add x21, x21, x17\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "smlal2 v19.4s, v28.8h, v5.8h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v31.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[6], [x21]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[4], [x21]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x8, #1, 14f\n"
+ "ld1 { v31.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[2], [x21]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[0], [x21]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v19.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v27.4h, v6.4h\n"
+ "add x20, x20, x17\n"
+ "smlal2 v25.4s, v27.8h, v6.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v21.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x8, #1, 18f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr x26, [x14, #0x38]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "add x26, x26, x17\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x8, #1, 22f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ldr x25, [x14, #0x40]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "add x25, x25, x17\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v31.s }[0], [x25], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v31.h }[2], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[6], [x25]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[4], [x25]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x8, #1, 26f\n"
+ "ld1 { v31.h }[0], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[2], [x25]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[0], [x25]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ldr x19, [x14, #0x48]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "add x19, x19, x17\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v30.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v30.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[6], [x19]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[4], [x19]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x8, #1, 30f\n"
+ "ld1 { v30.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[2], [x19]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[0], [x19]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr x24, [x14, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "add x24, x24, x17\n"
+ "smlal v17.4s, v30.4h, v7.4h\n"
+ "smlal2 v25.4s, v30.8h, v7.8h\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v21.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ldr x23, [x14, #0x58]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
+ "smlal2 v19.4s, v29.8h, v3.8h\n"
+ "add x23, x23, x17\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x8, #1, 38f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ldr x22, [x14, #0x60]\n"
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v25.4s, v28.8h, v5.8h\n"
+ "add x22, x22, x17\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v31.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x8, #1, 42f\n"
+ "ld1 { v31.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ldr x21, [x14, #0x68]\n"
+ "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal2 v19.4s, v31.8h, v6.8h\n"
+ "add x21, x21, x17\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x8, #1, 46f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr x20, [x14, #0x70]\n"
+ "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v17.4s, v30.4h, v8.4h\n"
+ "smlal2 v25.4s, v30.8h, v8.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x8, #1, 50f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr x19, [x14, #0x78]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "add x19, x19, x17\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v28.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[6], [x19]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[4], [x19]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x8, #1, 54f\n"
+ "ld1 { v28.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[2], [x19]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[0], [x19]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v21.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v26.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 56f\n"
+ "ld1 { v11.d }[0], [x13], #0x8\n"
+ "ld1 { v18.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v11.s }[0], [x13]\n"
+ "ld1 { v18.s }[0], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x8, #1, 58f\n"
+ "ld1 { v26.d }[0], [x13], #0x8\n"
+ "ld1 { v10.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v26.s }[2], [x13]\n"
+ "ld1 { v10.s }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v26.s }[0], [x13]\n"
+ "ld1 { v10.s }[0], [x11]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "add x10, x10, x15\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "add x9, x9, x15\n"
+ "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+ "add x28, x28, x15\n"
+ "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+ "add x27, x27, x15\n"
+ "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+ "and v22.16b, v13.16b, v10.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v18.16b\n"
+ "and v3.16b, v17.16b, v10.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v6.16b, v25.16b, v18.16b\n"
+ "and v0.16b, v16.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v18.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v12.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "uzp1 v13.16b, v13.16b, v19.16b\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v16.4s, v16.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v24.4s\n"
+ "and v29.16b, v21.16b, v18.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v25.16b\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v3.16b, v23.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "add v16.4s, v16.4s, v15.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "and v25.16b, v20.16b, v18.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smin v16.4s, v16.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "srshl v20.4s, v20.4s, v18.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v13.s }[0], [x10], #0x4\n"
+ "st1 { v17.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
+ "st1 { v23.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v13.h }[2], [x10], #0x2\n"
+ "st1 { v17.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[6], [x10], #0x1\n"
+ "st1 { v17.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
+ "st1 { v23.b }[6], [x27], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[4], [x10], #0x1\n"
+ "st1 { v17.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
+ "st1 { v23.b }[4], [x27], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x8, #1, 62f\n"
+ "st1 { v13.h }[0], [x10], #0x2\n"
+ "st1 { v17.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
+ "st1 { v23.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[2], [x10], #0x1\n"
+ "st1 { v17.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
+ "st1 { v23.b }[2], [x27], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[0], [x10], #0x1\n"
+ "st1 { v17.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
+ "st1 { v23.b }[0], [x27], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+
+ "64:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
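Editor's note: the "Oddments" blocks in the hunk above handle the n_channels % 8 tail by testing one bit of the remainder at a time (TBZ on bits 2, 1 and 0) and issuing progressively narrower LD1/ST1 element accesses, so the kernel never touches bytes beyond the valid channels. A scalar sketch of the same ladder, with a hypothetical helper and not part of the patch:

    #include <cstdint>
    #include <cstring>

    // Move the n < 8 leftover channels exactly as the TBZ ladder does:
    // at most one 4-byte, one 2-byte and one 1-byte access, driven by
    // the individual bits of n.
    static void copy_oddments(int8_t *dst, const int8_t *src, unsigned int n)
    {
        if (n & 4) { std::memcpy(dst, src, 4); dst += 4; src += 4; } // bit 2
        if (n & 2) { std::memcpy(dst, src, 2); dst += 2; src += 2; } // bit 1
        if (n & 1) { *dst = *src; }                                  // bit 0
    }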
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b20759eec4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+ a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
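Editor's note: the same tile relation as the stride-1 variant applies to this descriptor with stride 2: using the input_extent sketch from earlier, input_extent(2, 3, 2) == (2 - 1) * 2 + 3 == 5, matching input_rows/input_cols = 5 above.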
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3b3d9c8946
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x5, #0x0\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x7, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x8, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x16, x4, #0x3\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v11.4s }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "cbz x16, 3f\n"
+ "subs x16, x16, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "mov v20.16b, v15.16b\n"
+ "ldr q10, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v15.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr d0, [x6, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "mov v23.16b, v10.16b\n"
+ "ldr d1, [x6, #0x8]\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr d2, [x6, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d3, [x6, #0x18]\n"
+ "ldr d4, [x6, #0x20]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr d5, [x6, #0x28]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldr d6, [x6, #0x30]\n"
+ "ldr d7, [x6, #0x38]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldr d8, [x6, #0x40]\n"
+ "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x26, x25, [x8, #0x0]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldp x24, x23, [x8, #0x10]\n"
+ "ssubl v7.8h, v7.8b, v13.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldp x22, x21, [x8, #0x20]\n"
+ "ldp x20, x19, [x8, #0x30]\n"
+ "ldr d31, [x26, x5]\n"
+ "ssubl v31.8h, v31.8b, v12.8b\n"
+ "ldr d30, [x25, x5]\n"
+ "ldr d29, [x24, x5]\n"
+ "ssubl v30.8h, v30.8b, v12.8b\n"
+ "ldr d28, [x23, x5]\n"
+ "ldr d27, [x22, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "ldr d26, [x21, x5]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "ldr d25, [x20, x5]\n"
+ "ldr d24, [x19, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x6, x6, #0x48\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ldr x22, [x8, #0x48]\n"
+ "subs x16, x16, #0x1\n"
+ "smlal v20.4s, v31.4h, v6.4h\n"
+ "ldr x21, [x8, #0x50]\n"
+ "smlal2 v23.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x8, #0x58]\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ldr x19, [x8, #0x60]\n"
+ "smlal2 v22.4s, v31.8h, v2.8h\n"
+ "ldr x10, [x8, #0x68]\n"
+ "smlal v17.4s, v31.4h, v0.4h\n"
+ "ldr x9, [x8, #0x70]\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x8, #0x78]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "ldr x27, [x8, #0x80]\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x26, [x8, #0x88]\n"
+ "smlal v20.4s, v28.4h, v1.4h\n"
+ "ldr x25, [x8, #0x90]\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x5]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "ldr x24, [x8, #0x98]\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x23, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ldr x23, [x8, #0xa0]\n"
+ "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x8, #0xa8]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x5]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "ldr x21, [x8, #0xb0]\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x19, x5]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "ldr x20, [x8, #0xb8]\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "ldr x19, [x8, #0xc0]\n"
+ "smlal v20.4s, v24.4h, v0.4h\n"
+ "ldr q21, [x17, #0x0]\n"
+ "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "ldr d24, [x9, x5]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v4.4h\n"
+ "ldr q30, [x15, #0x0]\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x10, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v5.4h\n"
+ "ldr q31, [x17, #0x10]\n"
+ "smlal2 v23.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x27, x5]\n"
+ "add x17, x17, #0x20\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "ldr q9, [x15, #0x10]\n"
+ "add x15, x15, #0x20\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x28, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v26.4h, v3.4h\n"
+ "smlal2 v22.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x26, x5]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v22.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x25, x5]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v29.4h, v4.4h\n"
+ "smlal2 v22.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v22.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x5]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x5]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x21, x5]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x19, x5]\n"
+ "add x5, x5, #0x8\n"
+ "smlal v16.4s, v27.4h, v7.4h\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v22.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v16.4s, v24.4h, v5.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+ "smlal2 v22.4s, v24.8h, v5.8h\n"
+ "smlal v17.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal v16.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "smlal v17.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "and v26.16b, v15.16b, v30.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v26.4s\n"
+ "and v8.16b, v10.16b, v9.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v30.4s\n"
+ "and v4.16b, v20.16b, v30.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v9.16b\n"
+ "and v1.16b, v16.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v11.4s\n"
+ "sqadd v10.4s, v10.4s, v8.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "smin v15.4s, v15.4s, v14.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "add v10.4s, v10.4s, v11.4s\n"
+ "srshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "uzp1 v15.16b, v15.16b, v10.16b\n"
+ "smax v20.4s, v20.4s, v19.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x14, x7]\n"
+ "smax v23.4s, v23.4s, v19.4s\n"
+ "srshl v16.4s, v16.4s, v30.4s\n"
+ "and v24.16b, v22.16b, v9.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v23.16b\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d20, [x13, x7]\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v2.16b, v17.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "smax v16.4s, v16.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v9.4s\n"
+ "and v31.16b, v18.16b, v9.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v31.4s\n"
+ "smax v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v16.16b, v16.16b, v22.16b\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "srshl v18.4s, v18.4s, v9.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x12, x7]\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "smax v17.4s, v17.4s, v19.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smax v18.4s, v18.4s, v19.4s\n"
+ "uzp1 v17.16b, v17.16b, v18.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x11, x7]\n"
+ "add x7, x7, #0x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "mov v20.16b, v15.16b\n"
+ "ldr q10, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v15.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr d0, [x6, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "mov v23.16b, v10.16b\n"
+ "ldr d1, [x6, #0x8]\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr d2, [x6, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d3, [x6, #0x18]\n"
+ "ldr d4, [x6, #0x20]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr d5, [x6, #0x28]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldr d6, [x6, #0x30]\n"
+ "ldr d7, [x6, #0x38]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldr d8, [x6, #0x40]\n"
+ "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x26, x25, [x8, #0x0]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldp x24, x23, [x8, #0x10]\n"
+ "ssubl v7.8h, v7.8b, v13.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldp x22, x21, [x8, #0x20]\n"
+ "ldp x20, x19, [x8, #0x30]\n"
+ "ldr d31, [x26, x5]\n"
+ "ssubl v31.8h, v31.8b, v12.8b\n"
+ "ldr d30, [x25, x5]\n"
+ "ldr d29, [x24, x5]\n"
+ "ssubl v30.8h, v30.8b, v12.8b\n"
+ "ldr d28, [x23, x5]\n"
+ "ldr d27, [x22, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "ldr d26, [x21, x5]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "ldr d25, [x20, x5]\n"
+ "ldr d24, [x19, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "ldr x23, [x8, #0x40]\n"
+ "tst x4, #0x7\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ldr x22, [x8, #0x48]\n"
+ "smlal v20.4s, v31.4h, v6.4h\n"
+ "ldr x21, [x8, #0x50]\n"
+ "smlal2 v23.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x8, #0x58]\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ldr x19, [x8, #0x60]\n"
+ "smlal2 v22.4s, v31.8h, v2.8h\n"
+ "ldr x10, [x8, #0x68]\n"
+ "smlal v17.4s, v31.4h, v0.4h\n"
+ "ldr x9, [x8, #0x70]\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x8, #0x78]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "ldr x27, [x8, #0x80]\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x26, [x8, #0x88]\n"
+ "smlal v20.4s, v28.4h, v1.4h\n"
+ "ldr x25, [x8, #0x90]\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x5]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "ldr x24, [x8, #0x98]\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x23, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ldr x23, [x8, #0xa0]\n"
+ "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x8, #0xa8]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x5]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "ldr x21, [x8, #0xb0]\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x19, x5]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "ldr x20, [x8, #0xb8]\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "ldr x19, [x8, #0xc0]\n"
+ "smlal v20.4s, v24.4h, v0.4h\n"
+ "ldr q21, [x17, #0x0]\n"
+ "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "ldr d24, [x9, x5]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v4.4h\n"
+ "ldr q30, [x15, #0x0]\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x10, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v5.4h\n"
+ "ldr q31, [x17, #0x10]\n"
+ "smlal2 v23.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x27, x5]\n"
+ "add x17, x17, #0x20\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "ldr q9, [x15, #0x10]\n"
+ "add x15, x15, #0x20\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x28, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v26.4h, v3.4h\n"
+ "smlal2 v22.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x26, x5]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v22.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x25, x5]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v29.4h, v4.4h\n"
+ "smlal2 v22.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x5]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v22.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x5]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x5]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x5]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x21, x5]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x19, x5]\n"
+ "add x5, x5, #0x8\n"
+ "smlal v16.4s, v27.4h, v7.4h\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v22.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v16.4s, v24.4h, v5.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+ "smlal2 v22.4s, v24.8h, v5.8h\n"
+ "smlal v17.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal v16.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "smlal v17.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "and v26.16b, v15.16b, v30.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v26.4s\n"
+ "and v8.16b, v10.16b, v9.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v30.4s\n"
+ "and v4.16b, v20.16b, v30.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v9.16b\n"
+ "and v1.16b, v16.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v11.4s\n"
+ "sqadd v10.4s, v10.4s, v8.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "smin v15.4s, v15.4s, v14.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "add v10.4s, v10.4s, v11.4s\n"
+ "srshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "uzp1 v15.16b, v15.16b, v10.16b\n"
+ "smax v20.4s, v20.4s, v19.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x14, x7]\n"
+ "smax v23.4s, v23.4s, v19.4s\n"
+ "srshl v16.4s, v16.4s, v30.4s\n"
+ "and v24.16b, v22.16b, v9.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v23.16b\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d20, [x13, x7]\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v2.16b, v17.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "smax v16.4s, v16.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v9.4s\n"
+ "and v31.16b, v18.16b, v9.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v31.4s\n"
+ "smax v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v16.16b, v16.16b, v22.16b\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "srshl v18.4s, v18.4s, v9.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x12, x7]\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "smax v17.4s, v17.4s, v19.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smax v18.4s, v18.4s, v19.4s\n"
+ "uzp1 v17.16b, v17.16b, v18.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x11, x7]\n"
+ "add x7, x7, #0x8\n"
+ "beq 88f\n"
+ "add x6, x6, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x4, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x4, #1, 4f\n"
+ "ld1 { v10.d }[0], [x19], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v10.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x4, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v20.16b, v15.16b\n"
+ "ldr d0, [x6, #0x0]\n"
+ "mov v23.16b, v10.16b\n"
+ "ldr d1, [x6, #0x8]\n"
+ "mov v16.16b, v15.16b\n"
+ "ldr d2, [x6, #0x10]\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr d3, [x6, #0x18]\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr d4, [x6, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d5, [x6, #0x28]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr d6, [x6, #0x30]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr d7, [x6, #0x38]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldr d8, [x6, #0x40]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x26, x25, [x8, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x24, x23, [x8, #0x10]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v13.8b\n"
+ "ldp x22, x21, [x8, #0x20]\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldp x20, x19, [x8, #0x30]\n"
+ "add x26, x26, x5\n"
+ "add x25, x25, x5\n"
+ "add x24, x24, x5\n"
+ "add x23, x23, x5\n"
+ "add x22, x22, x5\n"
+ "add x21, x21, x5\n"
+ "add x20, x20, x5\n"
+ "add x19, x19, x5\n"
+ "tbz x4, #2, 9f\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v30.s }[0], [x25], #0x4\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 8f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v30.b }[6], [x25]\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v30.b }[4], [x25]\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x4, #1, 10f\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v30.h }[0], [x25], #0x2\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v30.b }[2], [x25]\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v30.b }[0], [x25]\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ldr x23, [x8, #0x40]\n"
+ "ssubl v31.8h, v31.8b, v12.8b\n"
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "ssubl v30.8h, v30.8b, v12.8b\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v31.4h, v6.4h\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v23.4s, v31.8h, v6.8h\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v22.4s, v31.8h, v2.8h\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v17.4s, v31.4h, v0.4h\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "add x23, x23, x5\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v20.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v24.4h, v0.4h\n"
+ "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "tbz x4, #2, 13f\n"
+ "ld1 { v29.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 12f\n"
+ "ld1 { v29.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[6], [x23]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[4], [x23]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x4, #1, 14f\n"
+ "ld1 { v29.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[2], [x23]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[0], [x23]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ldr x22, [x8, #0x48]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v4.4h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "add x22, x22, x5\n"
+ "tbz x4, #2, 17f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 16f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[6], [x22]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[4], [x22]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x4, #1, 18f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[2], [x22]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[0], [x22]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ldr x21, [x8, #0x50]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v5.4h\n"
+ "smlal2 v23.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x5\n"
+ "tbz x4, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x4, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ldr x20, [x8, #0x58]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "add x20, x20, x5\n"
+ "smlal v20.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "tbz x4, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x4, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ldr x19, [x8, #0x60]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v16.4s, v26.4h, v3.4h\n"
+ "smlal2 v22.4s, v26.8h, v3.8h\n"
+ "add x19, x19, x5\n"
+ "tbz x4, #2, 29f\n"
+ "ld1 { v25.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 28f\n"
+ "ld1 { v25.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[6], [x19]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[4], [x19]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x4, #1, 30f\n"
+ "ld1 { v25.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[2], [x19]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[0], [x19]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ldr x10, [x8, #0x68]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "add x10, x10, x5\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v22.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 33f\n"
+ "ld1 { v29.s }[0], [x10], #0x4\n"
+ "tbz x4, #1, 32f\n"
+ "ld1 { v29.h }[2], [x10], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[6], [x10]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[4], [x10]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x4, #1, 34f\n"
+ "ld1 { v29.h }[0], [x10], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[2], [x10]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[0], [x10]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr x9, [x8, #0x70]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v16.4s, v29.4h, v4.4h\n"
+ "smlal2 v22.4s, v29.8h, v4.8h\n"
+ "add x9, x9, x5\n"
+ "tbz x4, #2, 37f\n"
+ "ld1 { v24.s }[0], [x9], #0x4\n"
+ "tbz x4, #1, 36f\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[6], [x9]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[4], [x9]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x4, #1, 38f\n"
+ "ld1 { v24.h }[0], [x9], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[2], [x9]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[0], [x9]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr x28, [x8, #0x78]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "add x28, x28, x5\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v22.4s, v24.8h, v1.8h\n"
+ "tbz x4, #2, 41f\n"
+ "ld1 { v27.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 40f\n"
+ "ld1 { v27.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[6], [x28]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[4], [x28]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x4, #1, 42f\n"
+ "ld1 { v27.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[2], [x28]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[0], [x28]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr x27, [x8, #0x80]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "add x27, x27, x5\n"
+ "tbz x4, #2, 45f\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
+ "tbz x4, #1, 44f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[4], [x27]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x4, #1, 46f\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[0], [x27]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr x26, [x8, #0x88]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "add x26, x26, x5\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "tbz x4, #2, 49f\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 48f\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[6], [x26]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[4], [x26]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x4, #1, 50f\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[2], [x26]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[0], [x26]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr x25, [x8, #0x90]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "add x25, x25, x5\n"
+ "tbz x4, #2, 53f\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
+ "tbz x4, #1, 52f\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x4, #1, 54f\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[0], [x25]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ldr x24, [x8, #0x98]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "add x24, x24, x5\n"
+ "tbz x4, #2, 57f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 56f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x4, #1, 58f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr x23, [x8, #0xa0]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "add x23, x23, x5\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "tbz x4, #2, 61f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 60f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x4, #1, 62f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr x22, [x8, #0xa8]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v27.4h, v7.4h\n"
+ "smlal2 v22.4s, v27.8h, v7.8h\n"
+ "add x22, x22, x5\n"
+ "tbz x4, #2, 65f\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 64f\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x4, #1, 66f\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr x21, [x8, #0xb0]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v16.4s, v24.4h, v5.4h\n"
+ "smlal2 v22.4s, v24.8h, v5.8h\n"
+ "add x21, x21, x5\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "tbz x4, #2, 69f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 68f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x4, #1, 70f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr x20, [x8, #0xb8]\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "add x20, x20, x5\n"
+ "tbz x4, #2, 73f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 72f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x4, #1, 74f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr x19, [x8, #0xc0]\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "add x19, x19, x5\n"
+ "smlal v17.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "tbz x4, #2, 77f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 76f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[6], [x19]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[4], [x19]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x4, #1, 78f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[2], [x19]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[0], [x19]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "tbz x4, #2, 81f\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "ld1 { v30.4s }, [x15], #0x10\n"
+ "tbz x4, #1, 80f\n"
+ "ld1 { v31.d }[0], [x17], #0x8\n"
+ "ld1 { v9.d }[0], [x15], #0x8\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v31.s }[2], [x17]\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v31.s }[0], [x17]\n"
+ "ld1 { v9.s }[0], [x15]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x4, #1, 82f\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "ld1 { v30.d }[0], [x15], #0x8\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v21.s }[2], [x17]\n"
+ "ld1 { v30.s }[2], [x15]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v21.s }[0], [x17]\n"
+ "ld1 { v30.s }[0], [x15]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+ "add x14, x14, x7\n"
+ "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+ "add x13, x13, x7\n"
+ "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+ "add x12, x12, x7\n"
+ "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+ "add x11, x11, x7\n"
+ "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+ "and v26.16b, v15.16b, v30.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v8.16b, v10.16b, v9.16b\n"
+ "and v4.16b, v20.16b, v30.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v9.16b\n"
+ "and v1.16b, v16.16b, v30.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v26.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "sqadd v10.4s, v10.4s, v8.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "srshl v15.4s, v15.4s, v30.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "add v15.4s, v15.4s, v11.4s\n"
+ "srshl v23.4s, v23.4s, v9.4s\n"
+ "add v10.4s, v10.4s, v11.4s\n"
+ "smin v15.4s, v15.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v19.4s\n"
+ "uzp1 v15.16b, v15.16b, v10.16b\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v19.4s\n"
+ "and v24.16b, v22.16b, v9.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v23.16b\n"
+ "srshl v16.4s, v16.4s, v30.4s\n"
+ "and v2.16b, v17.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v31.16b, v18.16b, v9.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "srshl v22.4s, v22.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v19.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "srshl v18.4s, v18.4s, v9.4s\n"
+ "smax v22.4s, v22.4s, v19.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v22.16b\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "smax v17.4s, v17.4s, v19.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smax v18.4s, v18.4s, v19.4s\n"
+ "uzp1 v17.16b, v17.16b, v18.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "tbz x4, #2, 85f\n"
+ "st1 { v15.s }[0], [x14], #0x4\n"
+ "st1 { v20.s }[0], [x13], #0x4\n"
+ "st1 { v16.s }[0], [x12], #0x4\n"
+ "st1 { v17.s }[0], [x11], #0x4\n"
+ "tbz x4, #1, 84f\n"
+ "st1 { v15.h }[2], [x14], #0x2\n"
+ "st1 { v20.h }[2], [x13], #0x2\n"
+ "st1 { v16.h }[2], [x12], #0x2\n"
+ "st1 { v17.h }[2], [x11], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[6], [x14], #0x1\n"
+ "st1 { v20.b }[6], [x13], #0x1\n"
+ "st1 { v16.b }[6], [x12], #0x1\n"
+ "st1 { v17.b }[6], [x11], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[4], [x14], #0x1\n"
+ "st1 { v20.b }[4], [x13], #0x1\n"
+ "st1 { v16.b }[4], [x12], #0x1\n"
+ "st1 { v17.b }[4], [x11], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 86f\n"
+ "st1 { v15.h }[0], [x14], #0x2\n"
+ "st1 { v20.h }[0], [x13], #0x2\n"
+ "st1 { v16.h }[0], [x12], #0x2\n"
+ "st1 { v17.h }[0], [x11], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[2], [x14], #0x1\n"
+ "st1 { v20.b }[2], [x13], #0x1\n"
+ "st1 { v16.b }[2], [x12], #0x1\n"
+ "st1 { v17.b }[2], [x11], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[0], [x14], #0x1\n"
+ "st1 { v20.b }[0], [x13], #0x1\n"
+ "st1 { v16.b }[0], [x12], #0x1\n"
+ "st1 { v17.b }[0], [x11], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+
+ "88:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..a998fa16d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
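+  // A 2x2 output tile at stride 1 with a 5x5 kernel consumes a 6x6 input
+  // patch: input = output + kernel - 1 in each dimension.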
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+
+ kern_type kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+ a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ab64f53f66
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
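+      // Permute the first 14 raw input pointers into the order in which the
+      // assembly below visits them; entries 14-35 pass through unchanged.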
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x10, #0x0\n"
+ "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x1, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x25, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x19, x4, #0x3\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v7.16b }, [x13]\n"
+ "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v19.4s }, [x8]\n"
+ "add x8, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "ld1r { v12.4s }, [x8]\n"
+ "ldp x17, x16, [x21, #0x0]\n"
+ "ldp x6, x8, [x21, #0x10]\n"
+ "cbz x19, 3f\n"
+ "subs x19, x19, #0x1\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x12, #0x0]\n"
+ "mov v18.16b, v15.16b\n"
+ "ldr q20, [x12, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ "mov v11.16b, v15.16b\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "ldr d31, [x28, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "ldr d30, [x27, x10]\n"
+ "ldr d29, [x26, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "ldr d28, [x13, x10]\n"
+ "ldr d27, [x24, x10]\n"
+ "ssubl v29.8h, v29.8b, v7.8b\n"
+ "ldr d23, [x23, x10]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "ldr d25, [x22, x10]\n"
+ "ldr d24, [x21, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "ldr d26, [x20, x10]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "ldr d22, [x0, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "ssubl v22.8h, v22.8b, v7.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x25, #0x50]\n"
+ "subs x19, x19, #0x1\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "smlal v10.4s, v28.4h, v0.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v1.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x0, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "smlal2 v9.4s, v31.8h, v2.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr q6, [x2, #0x0]\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x7, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr q21, [x5, #0x0]\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "ldr q17, [x2, #0x10]\n"
+ "add x2, x2, #0x20\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "ldr q14, [x5, #0x10]\n"
+ "add x5, x5, #0x20\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v9.4s, v30.8h, v3.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x26, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v20.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v9.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x10]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v1.4h\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v9.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v20.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x20, x10]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v31.4h, v2.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v9.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v20.4s, v31.8h, v3.8h\n"
+ "ldr d31, [x13, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal v10.4s, v23.4h, v3.4h\n"
+ "smlal2 v9.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x10]\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v20.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x0, x10]\n"
+ "ssubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "smlal v10.4s, v30.4h, v0.4h\n"
+ "smlal2 v9.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v20.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x11, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal v10.4s, v26.4h, v1.4h\n"
+ "smlal2 v9.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v20.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x24, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x15, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v20.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x10]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x12, x10]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v22.4h, v4.4h\n"
+ "smlal2 v9.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x27, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal v10.4s, v23.4h, v0.4h\n"
+ "smlal2 v9.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v20.4s, v26.8h, v2.8h\n"
+ "ldr d26, [x7, x10]\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v30.4h, v2.4h\n"
+ "smlal2 v9.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x26, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v28.4h, v3.4h\n"
+ "smlal2 v9.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "add x3, x3, #0xc8\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v20.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x10]\n"
+ "smlal v18.4s, v23.4h, v0.4h\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x20, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v20.4s, v23.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x13, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v1.4h\n"
+ "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x10]\n"
+ "add x10, x10, #0x8\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v20.4s, v30.8h, v3.8h\n"
+ "smlal v18.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v20.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "and v1.16b, v15.16b, v21.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v29.16b, v20.16b, v14.16b\n"
+ "and v3.16b, v18.16b, v21.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v2.16b, v5.16b, v14.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v1.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "and v0.16b, v11.16b, v21.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "and v27.16b, v8.16b, v14.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "smin v15.4s, v15.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v19.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "add v5.4s, v5.4s, v19.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "sqadd v11.4s, v11.4s, v0.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x17, x1]\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "and v30.16b, v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v5.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d18, [x16, x1]\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x6, x1]\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d10, [x8, x1]\n"
+ "add x1, x1, #0x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x12, #0x0]\n"
+ "mov v18.16b, v15.16b\n"
+ "ldr q20, [x12, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ "mov v11.16b, v15.16b\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "ldr d31, [x28, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "ldr d30, [x27, x10]\n"
+ "ldr d29, [x26, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "ldr d28, [x13, x10]\n"
+ "ldr d27, [x24, x10]\n"
+ "ssubl v29.8h, v29.8b, v7.8b\n"
+ "ldr d23, [x23, x10]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "ldr d25, [x22, x10]\n"
+ "ldr d24, [x21, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "ldr d26, [x20, x10]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "ldr d22, [x0, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "ssubl v22.8h, v22.8b, v7.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x25, #0x50]\n"
+ "tst x4, #0x7\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "smlal v10.4s, v28.4h, v0.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v1.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x0, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "smlal2 v9.4s, v31.8h, v2.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x7, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr q6, [x2, #0x0]\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "ldr q21, [x5, #0x0]\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "ldr q17, [x2, #0x10]\n"
+ "add x2, x2, #0x20\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr q14, [x5, #0x10]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v9.4s, v30.8h, v3.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x26, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v20.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v9.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x10]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v1.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v9.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v20.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x20, x10]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v31.4h, v2.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v9.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v20.4s, v31.8h, v3.8h\n"
+ "ldr d31, [x13, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v3.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal v10.4s, v23.4h, v3.4h\n"
+ "smlal2 v9.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x10]\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v20.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x0, x10]\n"
+ "ssubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "smlal v10.4s, v30.4h, v0.4h\n"
+ "smlal2 v9.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v20.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x11, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal v10.4s, v26.4h, v1.4h\n"
+ "smlal2 v9.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v20.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x24, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x15, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v20.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x10]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x12, x10]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v22.4h, v4.4h\n"
+ "smlal2 v9.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x27, x10]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal v10.4s, v23.4h, v0.4h\n"
+ "smlal2 v9.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v20.4s, v26.8h, v2.8h\n"
+ "ldr d26, [x7, x10]\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v30.4h, v2.4h\n"
+ "smlal2 v9.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x26, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v28.4h, v3.4h\n"
+ "smlal2 v9.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v20.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x10]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x20, x10]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v20.4s, v23.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x13, x10]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v1.4h\n"
+ "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x10]\n"
+ "add x10, x10, #0x8\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v20.4s, v30.8h, v3.8h\n"
+ "smlal v18.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v20.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "and v1.16b, v15.16b, v21.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v29.16b, v20.16b, v14.16b\n"
+ "and v3.16b, v18.16b, v21.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v2.16b, v5.16b, v14.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v1.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "and v0.16b, v11.16b, v21.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "and v27.16b, v8.16b, v14.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "smin v15.4s, v15.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v19.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "add v5.4s, v5.4s, v19.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "sqadd v11.4s, v11.4s, v0.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x17, x1]\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "and v30.16b, v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v5.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d18, [x16, x1]\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x6, x1]\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d10, [x8, x1]\n"
+ "add x1, x1, #0x8\n"
+ "beq 124f\n"
+ "add x3, x3, #0xc8\n"
+ "3:" // Oddments
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x4, #2, 5f\n"
+ "ld1 { v15.4s }, [x12], #0x10\n"
+ "tbz x4, #1, 4f\n"
+ "ld1 { v20.d }[0], [x12], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v20.s }[2], [x12]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v20.s }[0], [x12]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x4, #1, 6f\n"
+ "ld1 { v15.d }[0], [x12], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[2], [x12]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[0], [x12]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v18.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v11.16b, v15.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "mov v9.16b, v20.16b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "add x28, x28, x10\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "add x13, x13, x10\n"
+ "add x24, x24, x10\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x0, x0, x10\n"
+ "tbz x4, #2, 9f\n"
+ "ld1 { v31.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v29.s }[0], [x26], #0x4\n"
+ "ld1 { v28.s }[0], [x13], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v22.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 8f\n"
+ "ld1 { v31.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x13], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v22.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x13]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v22.b }[6], [x0]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x13]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v22.b }[4], [x0]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x4, #1, 10f\n"
+ "ld1 { v31.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v29.h }[0], [x26], #0x2\n"
+ "ld1 { v28.h }[0], [x13], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v22.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x13]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v22.b }[2], [x0]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v29.b }[0], [x26]\n"
+ "ld1 { v28.b }[0], [x13]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v22.b }[0], [x0]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ldr x20, [x25, #0x50]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ssubl v29.8h, v29.8b, v7.8b\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v28.4h, v0.4h\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ssubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "add x20, x20, x10\n"
+ "smlal v18.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "tbz x4, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x4, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "add x28, x28, x10\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "smlal2 v9.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "tbz x4, #2, 17f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 16f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x4, #1, 18f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "add x0, x0, x10\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v9.4s, v30.8h, v3.8h\n"
+ "tbz x4, #2, 21f\n"
+ "ld1 { v27.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 20f\n"
+ "ld1 { v27.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[6], [x0]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[4], [x0]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x4, #1, 22f\n"
+ "ld1 { v27.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[2], [x0]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[0], [x0]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "add x7, x7, x10\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v20.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "tbz x4, #2, 25f\n"
+ "ld1 { v25.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 24f\n"
+ "ld1 { v25.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[6], [x7]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[4], [x7]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x4, #1, 26f\n"
+ "ld1 { v25.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[2], [x7]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[0], [x7]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d1, [x3, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v25.8h, v0.8h\n"
+ "add x26, x26, x10\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v18.4s, v23.4h, v1.4h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "tbz x4, #2, 29f\n"
+ "ld1 { v24.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 28f\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[6], [x26]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[4], [x26]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x4, #1, 30f\n"
+ "ld1 { v24.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[2], [x26]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[0], [x26]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d2, [x3, #0x38]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v1.8h\n"
+ "add x23, x23, x10\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v20.4s, v23.8h, v2.8h\n"
+ "smlal v18.4s, v31.4h, v2.4h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "tbz x4, #2, 33f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 32f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x4, #1, 34f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d3, [x3, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v27.8h, v2.8h\n"
+ "add x20, x20, x10\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v20.4s, v31.8h, v3.8h\n"
+ "smlal v18.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "tbz x4, #2, 37f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 36f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x4, #1, 38f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d4, [x3, #0x48]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v10.4s, v23.4h, v3.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v23.8h, v3.8h\n"
+ "add x22, x22, x10\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "tbz x4, #2, 41f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 40f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x4, #1, 42f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d0, [x3, #0x50]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "add x13, x13, x10\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v20.4s, v22.8h, v0.8h\n"
+ "smlal v18.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 45f\n"
+ "ld1 { v31.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 44f\n"
+ "ld1 { v31.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[6], [x13]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[4], [x13]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x4, #1, 46f\n"
+ "ld1 { v31.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[2], [x13]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[0], [x13]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ldr x21, [x25, #0x98]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "add x21, x21, x10\n"
+ "tbz x4, #2, 49f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 48f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x4, #1, 50f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d1, [x3, #0x58]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v10.4s, v30.4h, v0.4h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v30.8h, v0.8h\n"
+ "add x14, x14, x10\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v20.4s, v25.8h, v1.8h\n"
+ "smlal v18.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "tbz x4, #2, 53f\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
+ "tbz x4, #1, 52f\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[6], [x14]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[4], [x14]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x4, #1, 54f\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[2], [x14]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[0], [x14]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d2, [x3, #0x60]\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v10.4s, v26.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v26.8h, v1.8h\n"
+ "add x11, x11, x10\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v20.4s, v24.8h, v2.8h\n"
+ "smlal v18.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "tbz x4, #2, 57f\n"
+ "ld1 { v25.s }[0], [x11], #0x4\n"
+ "tbz x4, #1, 56f\n"
+ "ld1 { v25.h }[2], [x11], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[6], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[4], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x4, #1, 58f\n"
+ "ld1 { v25.h }[0], [x11], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[0], [x11]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d3, [x3, #0x68]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "add x24, x24, x10\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v18.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 61f\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 60f\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[6], [x24]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[4], [x24]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x4, #1, 62f\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[2], [x24]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[0], [x24]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d4, [x3, #0x70]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "add x0, x0, x10\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v20.4s, v23.8h, v4.8h\n"
+ "smlal v18.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 65f\n"
+ "ld1 { v22.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 64f\n"
+ "ld1 { v22.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[6], [x0]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[4], [x0]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x4, #1, 66f\n"
+ "ld1 { v22.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[2], [x0]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[0], [x0]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d0, [x3, #0x78]\n"
+ "ssubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v10.4s, v22.4h, v4.4h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v22.8h, v4.8h\n"
+ "add x15, x15, x10\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "tbz x4, #2, 69f\n"
+ "ld1 { v27.s }[0], [x15], #0x4\n"
+ "tbz x4, #1, 68f\n"
+ "ld1 { v27.h }[2], [x15], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[6], [x15]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[4], [x15]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x4, #1, 70f\n"
+ "ld1 { v27.h }[0], [x15], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[2], [x15]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[0], [x15]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ldr x9, [x25, #0xc8]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "add x9, x9, x10\n"
+ "tbz x4, #2, 73f\n"
+ "ld1 { v23.s }[0], [x9], #0x4\n"
+ "tbz x4, #1, 72f\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[6], [x9]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[4], [x9]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x4, #1, 74f\n"
+ "ld1 { v23.h }[0], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[2], [x9]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[0], [x9]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d1, [x3, #0x80]\n"
+ "ssubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v10.4s, v23.4h, v0.4h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v23.8h, v0.8h\n"
+ "add x27, x27, x10\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "smlal v18.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "tbz x4, #2, 77f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x4, #1, 76f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x4, #1, 78f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d2, [x3, #0x88]\n"
+ "ssubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v10.4s, v31.4h, v1.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "add x28, x28, x10\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v20.4s, v26.8h, v2.8h\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "tbz x4, #2, 81f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 80f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x4, #1, 82f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x3, #0x90]\n"
+ "ssubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v10.4s, v30.4h, v2.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v30.8h, v2.8h\n"
+ "add x12, x12, x10\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "tbz x4, #2, 85f\n"
+ "ld1 { v28.s }[0], [x12], #0x4\n"
+ "tbz x4, #1, 84f\n"
+ "ld1 { v28.h }[2], [x12], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[6], [x12]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[4], [x12]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x4, #1, 86f\n"
+ "ld1 { v28.h }[0], [x12], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[2], [x12]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[0], [x12]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d4, [x3, #0x98]\n"
+ "ssubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v10.4s, v28.4h, v3.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v28.8h, v3.8h\n"
+ "add x7, x7, x10\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "smlal v18.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "tbz x4, #2, 89f\n"
+ "ld1 { v26.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 88f\n"
+ "ld1 { v26.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[6], [x7]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[4], [x7]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x4, #1, 90f\n"
+ "ld1 { v26.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[2], [x7]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[0], [x7]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d0, [x3, #0xa0]\n"
+ "ssubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "add x26, x26, x10\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v20.4s, v27.8h, v0.8h\n"
+ "smlal v18.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "tbz x4, #2, 93f\n"
+ "ld1 { v25.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 92f\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x4, #1, 94f\n"
+ "ld1 { v25.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[0], [x26]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ldr x23, [x25, #0xf8]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "add x23, x23, x10\n"
+ "tbz x4, #2, 97f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 96f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x4, #1, 98f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[0], [x23]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d1, [x3, #0xa8]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "add x22, x22, x10\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v20.4s, v23.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "tbz x4, #2, 101f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 100f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x4, #1, 102f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d2, [x3, #0xb0]\n"
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v1.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "add x20, x20, x10\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "tbz x4, #2, 105f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 104f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x4, #1, 106f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d3, [x3, #0xb8]\n"
+ "ssubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "add x13, x13, x10\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v20.4s, v30.8h, v3.8h\n"
+ "smlal v18.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 109f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 108f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[6], [x13]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[4], [x13]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x4, #1, 110f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[2], [x13]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[0], [x13]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d4, [x3, #0xc0]\n"
+ "ssubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x10\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v20.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 113f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 112f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x4, #1, 114f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ssubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "tbz x4, #2, 117f\n"
+ "ld1 { v6.4s }, [x2], #0x10\n"
+ "ld1 { v21.4s }, [x5], #0x10\n"
+ "tbz x4, #1, 116f\n"
+ "ld1 { v17.d }[0], [x2], #0x8\n"
+ "ld1 { v14.d }[0], [x5], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v17.s }[2], [x2]\n"
+ "ld1 { v14.s }[2], [x5]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v17.s }[0], [x2]\n"
+ "ld1 { v14.s }[0], [x5]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x4, #1, 118f\n"
+ "ld1 { v6.d }[0], [x2], #0x8\n"
+ "ld1 { v21.d }[0], [x5], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v6.s }[2], [x2]\n"
+ "ld1 { v21.s }[2], [x5]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v6.s }[0], [x2]\n"
+ "ld1 { v21.s }[0], [x5]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "add x17, x17, x1\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "add x16, x16, x1\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "add x6, x6, x1\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "add x8, x8, x1\n"
+ "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+ "and v1.16b, v15.16b, v21.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v29.16b, v20.16b, v14.16b\n"
+ "and v3.16b, v18.16b, v21.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v2.16b, v5.16b, v14.16b\n"
+ "and v0.16b, v11.16b, v21.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v1.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "srshl v20.4s, v20.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v19.4s\n"
+ "smin v15.4s, v15.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "add v5.4s, v5.4s, v19.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqadd v11.4s, v11.4s, v0.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "and v27.16b, v8.16b, v14.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v5.16b\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "and v30.16b, v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x4, #2, 121f\n"
+ "st1 { v15.s }[0], [x17], #0x4\n"
+ "st1 { v18.s }[0], [x16], #0x4\n"
+ "st1 { v11.s }[0], [x6], #0x4\n"
+ "st1 { v10.s }[0], [x8], #0x4\n"
+ "tbz x4, #1, 120f\n"
+ "st1 { v15.h }[2], [x17], #0x2\n"
+ "st1 { v18.h }[2], [x16], #0x2\n"
+ "st1 { v11.h }[2], [x6], #0x2\n"
+ "st1 { v10.h }[2], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[6], [x17], #0x1\n"
+ "st1 { v18.b }[6], [x16], #0x1\n"
+ "st1 { v11.b }[6], [x6], #0x1\n"
+ "st1 { v10.b }[6], [x8], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[4], [x17], #0x1\n"
+ "st1 { v18.b }[4], [x16], #0x1\n"
+ "st1 { v11.b }[4], [x6], #0x1\n"
+ "st1 { v10.b }[4], [x8], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 122f\n"
+ "st1 { v15.h }[0], [x17], #0x2\n"
+ "st1 { v18.h }[0], [x16], #0x2\n"
+ "st1 { v11.h }[0], [x6], #0x2\n"
+ "st1 { v10.h }[0], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[2], [x17], #0x1\n"
+ "st1 { v18.b }[2], [x16], #0x1\n"
+ "st1 { v11.b }[2], [x6], #0x1\n"
+ "st1 { v10.b }[2], [x8], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[0], [x17], #0x1\n"
+ "st1 { v18.b }[0], [x16], #0x1\n"
+ "st1 { v11.b }[0], [x6], #0x1\n"
+ "st1 { v10.b }[0], [x8], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+
+ "124:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..4e845cceaf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+struct a64_s8q_nhwc_generic_output9_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
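
The struct above is a strategy descriptor: the depthwise machinery selects it by its trait constants (vl_type, n_output_points) and then calls through the kernel function pointer. n_output_points = 9 means each call produces nine output pixels per channel from an arbitrary list of n_points weighted input points. A reference model of what the _impl routine in the following file computes, as a sketch only: it assumes the input-pointer layout the assembly walks (nine pointers per point, consumed in order) but indexes the weights per point and channel for clarity, whereas the real packed parameter stream interleaves them in blocks of four channels; requantize is a stand-in for the fixed-point pipeline sketched after that file.

#include <cstdint>
#include "arm_gemm.hpp"  // for arm_gemm::Requantize32

// Stand-in for the sshl/sqrdmulh/srshl/clamp sequence in the kernel.
int8_t requantize(int32_t acc, const arm_gemm::Requantize32 &qp, unsigned channel);

void generic_output9_reference(const int8_t *const *inptrs, int8_t *const *outptrs,
                               const int32_t *bias, const int8_t *weights,
                               const arm_gemm::Requantize32 &qp,
                               unsigned n_points, unsigned n_channels)
{
    for (unsigned o = 0; o < 9; o++)
    {
        for (unsigned c = 0; c < n_channels; c++)
        {
            int32_t acc = bias ? bias[c] : 0;
            for (unsigned p = 0; p < n_points; p++)
            {
                // The ssubl instructions apply a_offset to the inputs and
                // b_offset to the weights before the widening multiply-add.
                acc += (int32_t(inptrs[p * 9 + o][c]) - qp.a_offset)
                     * (int32_t(weights[p * n_channels + c]) - qp.b_offset);
            }
            outptrs[o][c] = requantize(acc, qp, c);
        }
    }
}
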
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ad5545a304
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v10.16b }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v9.16b }, [x20]\n"
+ "ld1r { v8.4s }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "mov x11, #0x0\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "lsr x10, %x[n_channels], #0x2\n"
+ "cbz x10, 6f\n"
+ "1:" // Channel loop
+ "movi v27.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x11, #0x2\n"
+ "ldr q27, [%x[bias], x19]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov v26.16b, v27.16b\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v25.16b, v27.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "mov v24.16b, v27.16b\n"
+ "ldr s4, [x9, x11]\n"
+ "mov v23.16b, v27.16b\n"
+ "mov v22.16b, v27.16b\n"
+ "ldr s3, [x28, x11]\n"
+ "mov v21.16b, v27.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v20.16b, v27.16b\n"
+ "ldr s2, [x27, x11]\n"
+ "mov v19.16b, v27.16b\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "ldr s1, [x26, x11]\n"
+ "ssubl v4.8h, v4.8b, v10.8b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ssubl v3.8h, v3.8b, v10.8b\n"
+ "ldr s0, [x25, x11]\n"
+ "ssubl v2.8h, v2.8b, v10.8b\n"
+ "ssubl v1.8h, v1.8b, v10.8b\n"
+ "ldr s31, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ssubl v0.8h, v0.8b, v10.8b\n"
+ "ldr s30, [x23, x11]\n"
+ "ldr s29, [x22, x11]\n"
+ "ssubl v31.8h, v31.8b, v10.8b\n"
+ "ldr x21, [x20], #0x8\n"
+ "ssubl v30.8h, v30.8b, v10.8b\n"
+ "ldr s28, [x21, x11]\n"
+ "ssubl v29.8h, v29.8b, v10.8b\n"
+ "ssubl v28.8h, v28.8b, v10.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "ldr s4, [x9, x11]\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "ldr s3, [x28, x11]\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "ldr s2, [x27, x11]\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "ssubl v4.8h, v4.8b, v10.8b\n"
+ "ldr s1, [x26, x11]\n"
+ "ssubl v3.8h, v3.8b, v10.8b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ssubl v2.8h, v2.8b, v10.8b\n"
+ "ldr s0, [x25, x11]\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v10.8b\n"
+ "ldr s31, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ssubl v0.8h, v0.8b, v10.8b\n"
+ "ldr s30, [x23, x11]\n"
+ "ldr s29, [x22, x11]\n"
+ "ssubl v31.8h, v31.8b, v10.8b\n"
+ "ldr x21, [x20], #0x8\n"
+ "ssubl v30.8h, v30.8b, v10.8b\n"
+ "ldr s28, [x21, x11]\n"
+ "ssubl v29.8h, v29.8b, v10.8b\n"
+ "ssubl v28.8h, v28.8b, v10.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x19, x11, #0x2\n"
+ "ldr q6, [%x[rq_mul_ptr], x19]\n"
+ "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v27.4s, v27.4s, v7.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "sshl v26.4s, v26.4s, v7.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v7.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+ "sshl v24.4s, v24.4s, v7.4s\n"
+ "and v16.16b, v27.16b, v5.16b\n"
+ "and v18.16b, v26.16b, v5.16b\n"
+ "and v17.16b, v25.16b, v5.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "and v16.16b, v24.16b, v5.16b\n"
+ "add v27.4s, v27.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v8.4s\n"
+ "add v25.4s, v25.4s, v8.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v27.4s, v27.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v12.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x27, x11]\n"
+ "add v24.4s, v24.4s, v8.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x26, x11]\n"
+ "smax v24.4s, v24.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "sshl v23.4s, v23.4s, v7.4s\n"
+ "sshl v22.4s, v22.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sshl v21.4s, v21.4s, v7.4s\n"
+ "and v17.16b, v23.16b, v5.16b\n"
+ "and v16.16b, v22.16b, v5.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x24, x11]\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "sshl v20.4s, v20.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v7.4s\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "and v17.16b, v20.16b, v5.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v5.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v8.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "smax v21.4s, v21.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x23, x11]\n"
+ "add v19.4s, v19.4s, v8.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x22, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x21, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x20, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x19, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x10, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v27.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x19, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v27.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v27.s }[2], [x19], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+
+ "9:" // Oddments: Load bias: Done
+ "mov v26.16b, v27.16b\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v25.16b, v27.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "mov v24.16b, v27.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v23.16b, v27.16b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "mov v22.16b, v27.16b\n"
+ "add x28, x28, x11\n"
+ "mov v21.16b, v27.16b\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "mov v20.16b, v27.16b\n"
+ "add x27, x27, x11\n"
+ "mov v19.16b, v27.16b\n"
+ "ldr x21, [x20], #0x8\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h31, [x24], #0x2\n"
+ "ldr h30, [x23], #0x2\n"
+ "ldr h29, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v4.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v2.b }[2], [x27], #0x1\n"
+ "ld1 { v1.b }[2], [x26], #0x1\n"
+ "ld1 { v0.b }[2], [x25], #0x1\n"
+ "ld1 { v31.b }[2], [x24], #0x1\n"
+ "ld1 { v30.b }[2], [x23], #0x1\n"
+ "ld1 { v29.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ldr b4, [x9], #0x1\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b2, [x27], #0x1\n"
+ "ldr b1, [x26], #0x1\n"
+ "ldr b0, [x25], #0x1\n"
+ "ldr b31, [x24], #0x1\n"
+ "ldr b30, [x23], #0x1\n"
+ "ldr b29, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "ssubl v4.8h, v4.8b, v10.8b\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "ssubl v3.8h, v3.8b, v10.8b\n"
+ "ssubl v2.8h, v2.8b, v10.8b\n"
+ "ssubl v1.8h, v1.8b, v10.8b\n"
+ "ssubl v0.8h, v0.8b, v10.8b\n"
+ "ssubl v31.8h, v31.8b, v10.8b\n"
+ "ssubl v30.8h, v30.8b, v10.8b\n"
+ "ssubl v29.8h, v29.8b, v10.8b\n"
+ "ssubl v28.8h, v28.8b, v10.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "add x28, x28, x11\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "add x27, x27, x11\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "ldr x21, [x20], #0x8\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "add x26, x26, x11\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "add x25, x25, x11\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h31, [x24], #0x2\n"
+ "ldr h30, [x23], #0x2\n"
+ "ldr h29, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v4.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v2.b }[2], [x27], #0x1\n"
+ "ld1 { v1.b }[2], [x26], #0x1\n"
+ "ld1 { v0.b }[2], [x25], #0x1\n"
+ "ld1 { v31.b }[2], [x24], #0x1\n"
+ "ld1 { v30.b }[2], [x23], #0x1\n"
+ "ld1 { v29.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ldr b4, [x9], #0x1\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b2, [x27], #0x1\n"
+ "ldr b1, [x26], #0x1\n"
+ "ldr b0, [x25], #0x1\n"
+ "ldr b31, [x24], #0x1\n"
+ "ldr b30, [x23], #0x1\n"
+ "ldr b29, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "ssubl v4.8h, v4.8b, v10.8b\n"
+ "subs x19, x19, #0x1\n"
+ "ssubl v3.8h, v3.8b, v10.8b\n"
+ "ssubl v2.8h, v2.8b, v10.8b\n"
+ "ssubl v1.8h, v1.8b, v10.8b\n"
+ "ssubl v0.8h, v0.8b, v10.8b\n"
+ "ssubl v31.8h, v31.8b, v10.8b\n"
+ "ssubl v30.8h, v30.8b, v10.8b\n"
+ "ssubl v29.8h, v29.8b, v10.8b\n"
+ "ssubl v28.8h, v28.8b, v10.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v6.d }[0], [x21], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v7.d }[0], [x19], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v6.s }[2], [x21], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v7.s }[2], [x19], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v6.s }[0], [x21], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v7.s }[0], [x19], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v27.4s, v27.4s, v7.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "add x27, x27, x11\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "sshl v26.4s, v26.4s, v7.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "add x26, x26, x11\n"
+ "sshl v25.4s, v25.4s, v7.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "sshl v24.4s, v24.4s, v7.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x25, x25, x11\n"
+ "and v16.16b, v27.16b, v5.16b\n"
+ "add x24, x24, x11\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "add x23, x23, x11\n"
+ "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+ "add x22, x22, x11\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "add x21, x21, x11\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x20, x20, x11\n"
+ "and v18.16b, v26.16b, v5.16b\n"
+ "add x19, x19, x11\n"
+ "and v17.16b, v25.16b, v5.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v24.16b, v5.16b\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v27.4s, v27.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v8.4s\n"
+ "add v25.4s, v25.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v12.4s\n"
+ "smax v25.4s, v25.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smax v24.4s, v24.4s, v12.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sshl v23.4s, v23.4s, v7.4s\n"
+ "sshl v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v20.4s, v20.4s, v7.4s\n"
+ "and v17.16b, v23.16b, v5.16b\n"
+ "and v16.16b, v22.16b, v5.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "and v17.16b, v20.16b, v5.16b\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add v23.4s, v23.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "smax v23.4s, v23.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v8.4s\n"
+ "add v20.4s, v20.4s, v8.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v21.4s, v21.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+ "and v16.16b, v19.16b, v5.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v27.h }[0], [x27], #0x2\n"
+ "st1 { v26.h }[0], [x26], #0x2\n"
+ "st1 { v25.h }[0], [x25], #0x2\n"
+ "st1 { v24.h }[0], [x24], #0x2\n"
+ "st1 { v23.h }[0], [x23], #0x2\n"
+ "st1 { v22.h }[0], [x22], #0x2\n"
+ "st1 { v21.h }[0], [x21], #0x2\n"
+ "st1 { v20.h }[0], [x20], #0x2\n"
+ "st1 { v19.h }[0], [x19], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v27.b }[2], [x27], #0x1\n"
+ "st1 { v26.b }[2], [x26], #0x1\n"
+ "st1 { v25.b }[2], [x25], #0x1\n"
+ "st1 { v24.b }[2], [x24], #0x1\n"
+ "st1 { v23.b }[2], [x23], #0x1\n"
+ "st1 { v22.b }[2], [x22], #0x1\n"
+ "st1 { v21.b }[2], [x21], #0x1\n"
+ "st1 { v20.b }[2], [x20], #0x1\n"
+ "st1 { v19.b }[2], [x19], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v27.b }[0], [x27], #0x1\n"
+ "st1 { v26.b }[0], [x26], #0x1\n"
+ "st1 { v25.b }[0], [x25], #0x1\n"
+ "st1 { v24.b }[0], [x24], #0x1\n"
+ "st1 { v23.b }[0], [x23], #0x1\n"
+ "st1 { v22.b }[0], [x22], #0x1\n"
+ "st1 { v21.b }[0], [x21], #0x1\n"
+ "st1 { v20.b }[0], [x20], #0x1\n"
+ "st1 { v19.b }[0], [x19], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+
+ "24:" // End
+
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
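
The requantisation block in the kernel above is the usual gemmlowp-style fixed-point pipeline, applied per 32-bit accumulator lane: sshl applies the left shift (per-layer, or per-channel when rq_mul_ptr is set and the per-channel vectors are loaded over the per-layer ones), sqrdmulh performs a rounded doubling high multiply by the quantised multiplier, the and/sshr/sqadd trio biases negative values down by one so that the following srshl (a rounding right shift; the shift amount is stored negated) rounds ties away from zero, and finally c_offset is added, the result is clamped to [minval, maxval], and uzp1 narrows it to bytes. A scalar model, assuming the usual interpretation of these instructions and ignoring the saturation inside sqrdmulh/sqadd:

#include <algorithm>
#include <cstdint>

static inline int8_t requantize_lane(int32_t acc, int32_t mul, int32_t left_shift,
                                     int32_t right_shift,  // shift magnitude (stored negated for srshl)
                                     int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t s = int32_t(uint32_t(acc) << left_shift);                    // sshl
    int32_t h = int32_t((int64_t(s) * mul + (int64_t(1) << 30)) >> 31);  // sqrdmulh
    if (right_shift > 0)
    {
        // srshl alone rounds ties towards +infinity; the and/sshr/sqadd
        // fix-up nudges negative inputs down so ties round away from zero.
        h = (h + (1 << (right_shift - 1)) - (h < 0 ? 1 : 0)) >> right_shift;
    }
    h += c_offset;                                                       // add v8.4s
    return int8_t(std::min(std::max(h, minval), maxval));                // smax/smin, then uzp1
}
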
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..b9fef4f9ab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 9;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+ a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
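
The patch-size constants above follow from the usual direct-convolution geometry, input = (output - 1) * stride + kernel: (2-1)*2+3 = 5 rows and (4-1)*2+3 = 9 columns. The relationship can be checked at compile time against the struct itself:

using K3 = arm_conv::depthwise::a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst;
static_assert(K3::input_rows == (K3::output_rows - 1) * K3::stride_rows + K3::kernel_rows, "5 input rows");
static_assert(K3::input_cols == (K3::output_cols - 1) * K3::stride_cols + K3::kernel_cols, "9 input columns");
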
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2fb6d3538f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "movi v5.16b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "ushr v5.4s, v5.4s, #0x8\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "mov x11, #0x0\n"
+ "movi v1.4s, #0x0\n"
+ "ld1 { v15.16b }, [x22]\n"
+ "mov x10, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "ld1 { v29.16b }, [x20]\n"
+ "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "movi v25.4s, #0x0\n"
+ "ld1 { v0.16b }, [x19]\n"
+ "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "mov v20.16b, v15.16b\n"
+ "ldr x19, [%x[inptrs], #0x20]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x2\n"
+ "ld1r { v4.4s }, [x21]\n"
+ "mov v17.16b, v15.16b\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ld1 { v7.16b }, [x19]\n"
+ "mov v23.16b, v15.16b\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x6\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "mov v18.16b, v29.16b\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "zip1 v15.4s, v15.4s, v17.4s\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+ "ld1r { v14.4s }, [x9]\n"
+ "zip1 v20.4s, v20.4s, v23.4s\n"
+ "ld1r { v27.4s }, [x28]\n"
+ "zip1 v15.4s, v15.4s, v20.4s\n"
+ "ld1r { v23.4s }, [x27]\n"
+ "mov v17.16b, v29.16b\n"
+ "ldr q6, [%x[params], #0x0]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "mov v11.16b, v29.16b\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip1 v29.4s, v29.4s, v17.4s\n"
+ "mov v12.16b, v0.16b\n"
+ "ext v12.16b, v12.16b, v12.16b, #0x2\n"
+ "zip1 v18.4s, v18.4s, v11.4s\n"
+ "zip1 v29.4s, v29.4s, v18.4s\n"
+ "mov v17.16b, v0.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "mov v11.16b, v0.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+ "mov v18.16b, v2.16b\n"
+ "zip1 v0.4s, v0.4s, v17.4s\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+ "zip1 v12.4s, v12.4s, v11.4s\n"
+ "zip1 v0.4s, v0.4s, v12.4s\n"
+ "mov v17.16b, v2.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "mov v19.16b, v2.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x6\n"
+ "mov v28.16b, v7.16b\n"
+ "zip1 v2.4s, v2.4s, v17.4s\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "zip1 v18.4s, v18.4s, v19.4s\n"
+ "zip1 v2.4s, v2.4s, v18.4s\n"
+ "mov v18.16b, v7.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x4\n"
+ "mov v21.16b, v7.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x6\n"
+ "movi v30.4s, #0x0\n"
+ "zip1 v7.4s, v7.4s, v18.4s\n"
+ "movi v3.4s, #0x0\n"
+ "zip1 v28.4s, v28.4s, v21.4s\n"
+ "zip1 v7.4s, v7.4s, v28.4s\n"
+ "movi v12.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ ".inst 0x4f8fe0ba // sdot v26.4s, v5.16b, v15.4b[0]\n"
+ ".inst 0x4fafe0a1 // sdot v1.4s, v5.16b, v15.4b[1]\n"
+ ".inst 0x4f8fe8b6 // sdot v22.4s, v5.16b, v15.4b[2]\n"
+ ".inst 0x4fafe8b9 // sdot v25.4s, v5.16b, v15.4b[3]\n"
+ ".inst 0x4f9de0ad // sdot v13.4s, v5.16b, v29.4b[0]\n"
+ ".inst 0x4fbde0be // sdot v30.4s, v5.16b, v29.4b[1]\n"
+ ".inst 0x4f9de8a3 // sdot v3.4s, v5.16b, v29.4b[2]\n"
+ ".inst 0x4fbde8ac // sdot v12.4s, v5.16b, v29.4b[3]\n"
+ ".inst 0x4f80e0ab // sdot v11.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b3 // sdot v19.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8b5 // sdot v21.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4f82e0b0 // sdot v16.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bc // sdot v28.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8b2 // sdot v18.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8b4 // sdot v20.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f87e0b8 // sdot v24.4s, v5.16b, v7.4b[0]\n"
+ ".inst 0x4fa7e0bf // sdot v31.4s, v5.16b, v7.4b[1]\n"
+ "mov v26.16b, v26.16b\n"
+ "mov v1.16b, v1.16b\n"
+ "mov v22.16b, v22.16b\n"
+ "mov v25.16b, v25.16b\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "movi v13.4s, #0x0\n"
+ ".inst 0x4f87e8ad // sdot v13.4s, v5.16b, v7.4b[2]\n"
+ "add v1.4s, v1.4s, v30.4s\n"
+ "movi v30.4s, #0x0\n"
+ ".inst 0x4fa7e8be // sdot v30.4s, v5.16b, v7.4b[3]\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v1.4s, v1.4s, v19.4s\n"
+ "add v22.4s, v22.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v17.4s\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v19.16b, v21.16b\n"
+ "mov v21.16b, v17.16b\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add v3.4s, v3.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v18.4s\n"
+ "add v21.4s, v21.4s, v20.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v3.4s, v3.4s, v31.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "mul v26.4s, v26.4s, v4.4s\n"
+ "str q26, [SP, #0x0]\n"
+ "mul v1.4s, v1.4s, v4.4s\n"
+ "mul v22.4s, v22.4s, v4.4s\n"
+ "str q1, [SP, #0x10]\n"
+ "mul v25.4s, v25.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "str q22, [SP, #0x20]\n"
+ "mul v3.4s, v3.4s, v4.4s\n"
+ "str q25, [SP, #0x30]\n"
+ "mul v19.4s, v19.4s, v4.4s\n"
+ "mul v21.4s, v21.4s, v4.4s\n"
+ "str q11, [SP, #0x40]\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "str q3, [SP, #0x50]\n"
+ "add v1.4s, v1.4s, v6.4s\n"
+ "str q19, [SP, #0x60]\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "str q21, [SP, #0x70]\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "add v3.4s, v3.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ ".inst 0x4f8fe11a // sdot v26.4s, v8.16b, v15.4b[0]\n"
+ "ldr q20, [%x[params], #0x0]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x4fafe101 // sdot v1.4s, v8.16b, v15.4b[1]\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x4f8fe916 // sdot v22.4s, v8.16b, v15.4b[2]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x4fafe919 // sdot v25.4s, v8.16b, v15.4b[3]\n"
+ ".inst 0x4f80e10b // sdot v11.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e103 // sdot v3.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e915 // sdot v21.4s, v8.16b, v0.4b[3]\n"
+ "ldr q8, [%x[params], #0x30]\n"
+ ".inst 0x4f9de13a // sdot v26.4s, v9.16b, v29.4b[0]\n"
+ ".inst 0x4fbde121 // sdot v1.4s, v9.16b, v29.4b[1]\n"
+ ".inst 0x4f9de936 // sdot v22.4s, v9.16b, v29.4b[2]\n"
+ ".inst 0x4fbde939 // sdot v25.4s, v9.16b, v29.4b[3]\n"
+ ".inst 0x4f82e12b // sdot v11.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e123 // sdot v3.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4f82e933 // sdot v19.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e935 // sdot v21.4s, v9.16b, v2.4b[3]\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ ".inst 0x4f80e15a // sdot v26.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e141 // sdot v1.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4f80e956 // sdot v22.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e959 // sdot v25.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4f87e14b // sdot v11.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x4fa7e143 // sdot v3.4s, v10.16b, v7.4b[1]\n"
+ ".inst 0x4f87e953 // sdot v19.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x4fa7e955 // sdot v21.4s, v10.16b, v7.4b[3]\n"
+ "ldr q10, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+ "and v30.16b, v26.16b, v4.16b\n"
+ "and v17.16b, v1.16b, v4.16b\n"
+ "and v16.16b, v22.16b, v4.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v30.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v4.16b\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v1.4s, v1.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v23.4s\n"
+ "smin v1.4s, v1.4s, v23.4s\n"
+ "smin v22.4s, v22.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "smax v1.4s, v1.4s, v27.4s\n"
+ "smax v22.4s, v22.4s, v27.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x26, x10]\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q26, [SP, #0x0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "str s1, [x25, x10]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q1, [SP, #0x10]\n"
+ "and v16.16b, v11.16b, v4.16b\n"
+ "str s22, [x24, x10]\n"
+ "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+ "ldr q22, [SP, #0x20]\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "and v17.16b, v3.16b, v4.16b\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v23.4s\n"
+ "and v16.16b, v19.16b, v4.16b\n"
+ "srshl v11.4s, v11.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v27.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x23, x10]\n"
+ "smin v11.4s, v11.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v4.4s\n"
+ "ldr q25, [SP, #0x30]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v27.4s\n"
+ "add v3.4s, v3.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "smin v3.4s, v3.4s, v23.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str s11, [x22, x10]\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "ldr q11, [SP, #0x40]\n"
+ "and v16.16b, v21.16b, v4.16b\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v19.4s, v19.4s, v23.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str s3, [x21, x10]\n"
+ "smax v19.4s, v19.4s, v27.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr q3, [SP, #0x50]\n"
+ "add v1.4s, v1.4s, v6.4s\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x20, x10]\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "ldr q19, [SP, #0x60]\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "add v3.4s, v3.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x19, x10]\n"
+ "add x10, x10, #0x4\n"
+ "ldr q21, [SP, #0x70]\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ ".inst 0x4f8fe11a // sdot v26.4s, v8.16b, v15.4b[0]\n"
+ "ldr q20, [%x[params], #0x0]\n"
+ "add x26, x26, x10\n"
+ ".inst 0x4fafe101 // sdot v1.4s, v8.16b, v15.4b[1]\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ "add x25, x25, x10\n"
+ ".inst 0x4f8fe916 // sdot v22.4s, v8.16b, v15.4b[2]\n"
+ "add x24, x24, x10\n"
+ ".inst 0x4fafe919 // sdot v25.4s, v8.16b, v15.4b[3]\n"
+ "add x23, x23, x10\n"
+ ".inst 0x4f80e10b // sdot v11.4s, v8.16b, v0.4b[0]\n"
+ "add x22, x22, x10\n"
+ ".inst 0x4fa0e103 // sdot v3.4s, v8.16b, v0.4b[1]\n"
+ "add x21, x21, x10\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ "add x20, x20, x10\n"
+ ".inst 0x4fa0e915 // sdot v21.4s, v8.16b, v0.4b[3]\n"
+ "add x19, x19, x10\n"
+ ".inst 0x4f9de13a // sdot v26.4s, v9.16b, v29.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x4fbde121 // sdot v1.4s, v9.16b, v29.4b[1]\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x4f9de936 // sdot v22.4s, v9.16b, v29.4b[2]\n"
+ ".inst 0x4fbde939 // sdot v25.4s, v9.16b, v29.4b[3]\n"
+ ".inst 0x4f82e12b // sdot v11.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e123 // sdot v3.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4f82e933 // sdot v19.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e935 // sdot v21.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4f80e15a // sdot v26.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e141 // sdot v1.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4f80e956 // sdot v22.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e959 // sdot v25.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4f87e14b // sdot v11.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x4fa7e143 // sdot v3.4s, v10.16b, v7.4b[1]\n"
+ ".inst 0x4f87e953 // sdot v19.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x4fa7e955 // sdot v21.4s, v10.16b, v7.4b[3]\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "and v30.16b, v26.16b, v4.16b\n"
+ "and v17.16b, v1.16b, v4.16b\n"
+ "and v16.16b, v22.16b, v4.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v30.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v4.16b\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v1.4s, v1.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v23.4s\n"
+ "smin v1.4s, v1.4s, v23.4s\n"
+ "smin v22.4s, v22.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "smax v1.4s, v1.4s, v27.4s\n"
+ "smax v22.4s, v22.4s, v27.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "and v16.16b, v11.16b, v4.16b\n"
+ "and v17.16b, v3.16b, v4.16b\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v23.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "smax v25.4s, v25.4s, v27.4s\n"
+ "and v16.16b, v19.16b, v4.16b\n"
+ "srshl v11.4s, v11.4s, v4.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "srshl v3.4s, v3.4s, v4.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v3.4s, v3.4s, v14.4s\n"
+ "smin v11.4s, v11.4s, v23.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v23.4s\n"
+ "smax v11.4s, v11.4s, v27.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "and v16.16b, v21.16b, v4.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v19.4s, v19.4s, v23.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v27.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "blt 3f\n"
+ "str s26, [x26, #0x0]\n"
+ "str s1, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s11, [x22, #0x0]\n"
+ "str s3, [x21, #0x0]\n"
+ "str s19, [x20, #0x0]\n"
+ "str s21, [x19, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "st1 { v26.b }[0], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v25.b }[0], [x23], #0x1\n"
+ "st1 { v11.b }[0], [x22], #0x1\n"
+ "st1 { v3.b }[0], [x21], #0x1\n"
+ "st1 { v19.b }[0], [x20], #0x1\n"
+ "st1 { v21.b }[0], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v26.b }[1], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[1], [x25], #0x1\n"
+ "st1 { v22.b }[1], [x24], #0x1\n"
+ "st1 { v25.b }[1], [x23], #0x1\n"
+ "st1 { v11.b }[1], [x22], #0x1\n"
+ "st1 { v3.b }[1], [x21], #0x1\n"
+ "st1 { v19.b }[1], [x20], #0x1\n"
+ "st1 { v21.b }[1], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v26.b }[2], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v25.b }[2], [x23], #0x1\n"
+ "st1 { v11.b }[2], [x22], #0x1\n"
+ "st1 { v3.b }[2], [x21], #0x1\n"
+ "st1 { v19.b }[2], [x20], #0x1\n"
+ "st1 { v21.b }[2], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v26.b }[3], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[3], [x25], #0x1\n"
+ "st1 { v22.b }[3], [x24], #0x1\n"
+ "st1 { v25.b }[3], [x23], #0x1\n"
+ "st1 { v11.b }[3], [x22], #0x1\n"
+ "st1 { v3.b }[3], [x21], #0x1\n"
+ "st1 { v19.b }[3], [x20], #0x1\n"
+ "st1 { v21.b }[3], [x19], #0x1\n"
+ "4:" // Tail: End
+ "add SP, SP, #0x80\n"
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
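
The long prologue of this kernel (everything before the Loop label) computes the input-side zero-point correction. v5 is built as a byte mask with ones in the positions of the three valid taps of each packed quad (movi #0x1 then ushr by 8 per word yields the byte pattern 1,1,1,0), so the sdot instructions against v5 produce, per output pixel, the plain sum of the participating input bytes. Those sums are multiplied by neg(b_offset), spilled to the stack, and added to the packed bias: this is the -b_offset * sum(x) term of the expansion of (x - a_offset)(w - b_offset), the remaining offset terms presumably having been folded into the bias at pack time. A scalar sketch of the per-pixel correction, with illustrative names:

#include <cstdint>

// Per output pixel: bias + (-b_offset) * sum of the input bytes under the
// window. The kernel computes this for all eight pixels at once via sdot
// against the 1,1,1,0 byte mask in v5.
static inline int32_t offset_corrected_bias(const int8_t *taps, unsigned n_taps,
                                            int32_t bias, int32_t b_offset)
{
    int32_t sum = 0;
    for (unsigned i = 0; i < n_taps; i++)
        sum += taps[i];              // masked sdot with a ones vector
    return bias + sum * -b_offset;   // neg v..., mul, then add the bias
}
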
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..9a3eed47fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 8;
+ constexpr static unsigned int input_cols = 6;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+ a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
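
The same geometry identity holds here: (4-1)*1+5 = 8 input rows and (2-1)*1+5 = 6 input columns, and the generic.cpp that follows correspondingly loads eight row pointers from inptrs. A minimal sketch of driving this strategy directly, under the assumption that the caller has already gathered the pointers and packed the parameters (normally the depthwise planner does this; all names below are illustrative):

using K5 = arm_conv::depthwise::a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst;

void run_5x5_tile(const int8_t *const *inptrs,   // K5::input_rows row pointers
                  int8_t *const *outptrs,        // K5::output_rows * K5::output_cols pointers
                  const void *packed_params,     // packed weights/bias/quant parameters
                  unsigned n_output_channels,
                  const arm_gemm::Requantize32 &qp)
{
    K5 strat(nullptr);  // the CPUInfo argument is unused by this strategy
    strat.kernel(inptrs, outptrs, packed_params, n_output_channels, qp);
}
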
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..95ad78cf6c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "movi v15.16b, #0x1\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "movi v14.4s, #0x1\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "movi v28.4s, #0x0\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "mov x11, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ld1 { v13.16b }, [x21]\n"
+ "mov x10, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "ld1 { v12.16b }, [x20]\n"
+ "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "movi v25.4s, #0x0\n"
+ "ld1 { v7.16b }, [x19]\n"
+ "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "ldr x19, [%x[inptrs], #0x28]\n"
+ "mov v17.16b, v12.16b\n"
+ "ld1 { v6.16b }, [x21]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "mov v16.16b, v7.16b\n"
+ "ld1 { v4.16b }, [x19]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "zip1 v13.2d, v13.2d, v18.2d\n"
+ "ldr x19, [%x[inptrs], #0x38]\n"
+ "zip1 v12.2d, v12.2d, v17.2d\n"
+ "ld1r { v3.4s }, [x22]\n"
+ "mov v18.16b, v6.16b\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v16.2d\n"
+ "ld1 { v1.16b }, [x19]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v5.16b\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "mov v16.16b, v4.16b\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "zip1 v6.2d, v6.2d, v18.2d\n"
+ "ld1r { v0.4s }, [x9]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ld1r { v31.4s }, [x28]\n"
+ "zip1 v5.2d, v5.2d, v17.2d\n"
+ "ld1r { v30.4s }, [x27]\n"
+ "mov v17.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0x0]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "zip1 v4.2d, v4.2d, v16.2d\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "mov v16.16b, v1.16b\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "add %x[params], %x[params], #0x50\n"
+ "zip1 v2.2d, v2.2d, v17.2d\n"
+ "movi v23.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "zip1 v1.2d, v1.2d, v16.2d\n"
+ "movi v21.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4f8de1fc // sdot v28.4s, v15.16b, v13.4b[0]\n"
+ ".inst 0x4f8de9fb // sdot v27.4s, v15.16b, v13.4b[2]\n"
+ ".inst 0x4f8ce1fa // sdot v26.4s, v15.16b, v12.4b[0]\n"
+ ".inst 0x4f8ce9f9 // sdot v25.4s, v15.16b, v12.4b[2]\n"
+ ".inst 0x4fade1dc // sdot v28.4s, v14.16b, v13.4b[1]\n"
+ ".inst 0x4fade9db // sdot v27.4s, v14.16b, v13.4b[3]\n"
+ ".inst 0x4face1da // sdot v26.4s, v14.16b, v12.4b[1]\n"
+ ".inst 0x4face9d9 // sdot v25.4s, v14.16b, v12.4b[3]\n"
+ ".inst 0x4f87e1f8 // sdot v24.4s, v15.16b, v7.4b[0]\n"
+ ".inst 0x4f87e9f7 // sdot v23.4s, v15.16b, v7.4b[2]\n"
+ ".inst 0x4f86e1f6 // sdot v22.4s, v15.16b, v6.4b[0]\n"
+ ".inst 0x4f86e9f5 // sdot v21.4s, v15.16b, v6.4b[2]\n"
+ ".inst 0x4fa7e1d8 // sdot v24.4s, v14.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e9d7 // sdot v23.4s, v14.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e1d6 // sdot v22.4s, v14.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e9d5 // sdot v21.4s, v14.16b, v6.4b[3]\n"
+ ".inst 0x4f85e1f2 // sdot v18.4s, v15.16b, v5.4b[0]\n"
+ ".inst 0x4f85e9f1 // sdot v17.4s, v15.16b, v5.4b[2]\n"
+ ".inst 0x4f84e1f0 // sdot v16.4s, v15.16b, v4.4b[0]\n"
+ ".inst 0x4f84e9f4 // sdot v20.4s, v15.16b, v4.4b[2]\n"
+ ".inst 0x4fa5e1d2 // sdot v18.4s, v14.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e9d1 // sdot v17.4s, v14.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e1d0 // sdot v16.4s, v14.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e9d4 // sdot v20.4s, v14.16b, v4.4b[3]\n"
+ ".inst 0x4f82e1f3 // sdot v19.4s, v15.16b, v2.4b[0]\n"
+ "mov v28.16b, v28.16b\n"
+ "mov v27.16b, v27.16b\n"
+ "add v28.4s, v28.4s, v26.4s\n"
+ ".inst 0x4fa2e1d3 // sdot v19.4s, v14.16b, v2.4b[1]\n"
+ "add v27.4s, v27.4s, v25.4s\n"
+ "add v28.4s, v28.4s, v24.4s\n"
+ "mov v26.16b, v26.16b\n"
+ "add v27.4s, v27.4s, v23.4s\n"
+ "add v28.4s, v28.4s, v22.4s\n"
+ "mov v25.16b, v25.16b\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v28.4s, v18.4s\n"
+ "add v26.4s, v26.4s, v24.4s\n"
+ "add v27.4s, v27.4s, v17.4s\n"
+ "add v25.4s, v25.4s, v23.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "mov v24.16b, v24.16b\n"
+ "add v25.4s, v25.4s, v21.4s\n"
+ "add v26.4s, v26.4s, v18.4s\n"
+ "mov v23.16b, v23.16b\n"
+ "add v25.4s, v25.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v22.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v18.4s\n"
+ "mov v22.16b, v22.16b\n"
+ "add v23.4s, v23.4s, v17.4s\n"
+ "add v24.4s, v24.4s, v16.4s\n"
+ "mov v21.16b, v21.16b\n"
+ "add v23.4s, v23.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v19.4s\n"
+ "add v22.4s, v22.4s, v18.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4f82e9f2 // sdot v18.4s, v15.16b, v2.4b[2]\n"
+ "add v21.4s, v21.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f81e1f1 // sdot v17.4s, v15.16b, v1.4b[0]\n"
+ ".inst 0x4fa2e9d2 // sdot v18.4s, v14.16b, v2.4b[3]\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x4fa1e1d1 // sdot v17.4s, v14.16b, v1.4b[1]\n"
+ ".inst 0x4f81e9f0 // sdot v16.4s, v15.16b, v1.4b[2]\n"
+ "add v23.4s, v23.4s, v18.4s\n"
+ "add v21.4s, v21.4s, v20.4s\n"
+ "add v22.4s, v22.4s, v19.4s\n"
+ ".inst 0x4fa1e9d0 // sdot v16.4s, v14.16b, v1.4b[3]\n"
+ "add v21.4s, v21.4s, v18.4s\n"
+ "add v22.4s, v22.4s, v17.4s\n"
+ "neg v3.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "mul v28.4s, v28.4s, v3.4s\n"
+ "str q28, [SP, #0x0]\n"
+ "mul v27.4s, v27.4s, v3.4s\n"
+ "mul v26.4s, v26.4s, v3.4s\n"
+ "str q27, [SP, #0x10]\n"
+ "mul v25.4s, v25.4s, v3.4s\n"
+ "mul v24.4s, v24.4s, v3.4s\n"
+ "str q26, [SP, #0x20]\n"
+ "mul v23.4s, v23.4s, v3.4s\n"
+ "str q25, [SP, #0x30]\n"
+ "mul v22.4s, v22.4s, v3.4s\n"
+ "mul v21.4s, v21.4s, v3.4s\n"
+ "str q24, [SP, #0x40]\n"
+ "add v28.4s, v28.4s, v29.4s\n"
+ "str q23, [SP, #0x50]\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "str q22, [SP, #0x60]\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q21, [SP, #0x70]\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ ".inst 0x4f8de11c // sdot v28.4s, v8.16b, v13.4b[0]\n"
+ "ldr q20, [%x[params], #0x60]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x4f8de91b // sdot v27.4s, v8.16b, v13.4b[2]\n"
+ "ldr q19, [%x[params], #0x70]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x4f8ce11a // sdot v26.4s, v8.16b, v12.4b[0]\n"
+ "ldr q29, [%x[params], #0x80]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x4f8ce919 // sdot v25.4s, v8.16b, v12.4b[2]\n"
+ ".inst 0x4f87e118 // sdot v24.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x4f87e917 // sdot v23.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4f86e116 // sdot v22.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x4f86e915 // sdot v21.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ ".inst 0x4fade13c // sdot v28.4s, v9.16b, v13.4b[1]\n"
+ ".inst 0x4fade93b // sdot v27.4s, v9.16b, v13.4b[3]\n"
+ ".inst 0x4face13a // sdot v26.4s, v9.16b, v12.4b[1]\n"
+ ".inst 0x4face939 // sdot v25.4s, v9.16b, v12.4b[3]\n"
+ ".inst 0x4fa7e138 // sdot v24.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e937 // sdot v23.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e136 // sdot v22.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e935 // sdot v21.4s, v9.16b, v6.4b[3]\n"
+ "ldr q9, [%x[params], #0x10]\n"
+ ".inst 0x4f8ce15c // sdot v28.4s, v10.16b, v12.4b[0]\n"
+ ".inst 0x4f8ce95b // sdot v27.4s, v10.16b, v12.4b[2]\n"
+ ".inst 0x4f87e15a // sdot v26.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x4f87e959 // sdot v25.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x4f86e158 // sdot v24.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x4f86e957 // sdot v23.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x4f85e955 // sdot v21.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%x[params], #0x20]\n"
+ ".inst 0x4face17c // sdot v28.4s, v11.16b, v12.4b[1]\n"
+ ".inst 0x4face97b // sdot v27.4s, v11.16b, v12.4b[3]\n"
+ ".inst 0x4fa7e17a // sdot v26.4s, v11.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e979 // sdot v25.4s, v11.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e178 // sdot v24.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e977 // sdot v23.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4fa5e176 // sdot v22.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e975 // sdot v21.4s, v11.16b, v5.4b[3]\n"
+ "ldr q11, [%x[params], #0x30]\n"
+ ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x4f87e91b // sdot v27.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4f86e11a // sdot v26.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x4f86e919 // sdot v25.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x4f85e118 // sdot v24.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e917 // sdot v23.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x4f84e116 // sdot v22.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e915 // sdot v21.4s, v8.16b, v4.4b[2]\n"
+ "ldr q8, [%x[params], #0x40]\n"
+ ".inst 0x4fa7e13c // sdot v28.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e93b // sdot v27.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e939 // sdot v25.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x4fa5e138 // sdot v24.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e937 // sdot v23.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e136 // sdot v22.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e935 // sdot v21.4s, v9.16b, v4.4b[3]\n"
+ "ldr q9, [%x[params], #0x50]\n"
+ ".inst 0x4f86e15c // sdot v28.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x4f86e95b // sdot v27.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x4f85e15a // sdot v26.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x4f85e959 // sdot v25.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x4f84e158 // sdot v24.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e957 // sdot v23.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x4f82e156 // sdot v22.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e955 // sdot v21.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%x[params], #0xb0]\n"
+ ".inst 0x4fa6e17c // sdot v28.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e97b // sdot v27.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4fa5e17a // sdot v26.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e979 // sdot v25.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e178 // sdot v24.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e977 // sdot v23.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x4fa2e176 // sdot v22.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e975 // sdot v21.4s, v11.16b, v2.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x4f85e11c // sdot v28.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x4f84e11a // sdot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e917 // sdot v23.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ ".inst 0x4fa5e13c // sdot v28.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e13a // sdot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e937 // sdot v23.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa1e136 // sdot v22.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e935 // sdot v21.4s, v9.16b, v1.4b[3]\n"
+ "ldr q9, [%x[params], #0xa0]\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+ "and v18.16b, v28.16b, v19.16b\n"
+ "and v17.16b, v27.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v19.16b\n"
+ "srshl v28.4s, v28.4s, v19.4s\n"
+ "srshl v27.4s, v27.4s, v19.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v30.4s\n"
+ "smin v27.4s, v27.4s, v30.4s\n"
+ "smin v26.4s, v26.4s, v30.4s\n"
+ "smax v28.4s, v28.4s, v31.4s\n"
+ "smax v27.4s, v27.4s, v31.4s\n"
+ "smax v26.4s, v26.4s, v31.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x26, x10]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ldr q28, [SP, #0x0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "str s27, [x25, x10]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ldr q27, [SP, #0x10]\n"
+ "and v16.16b, v24.16b, v19.16b\n"
+ "str s26, [x24, x10]\n"
+ "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+ "ldr q26, [SP, #0x20]\n"
+ "srshl v25.4s, v25.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v30.4s\n"
+ "and v16.16b, v22.16b, v19.16b\n"
+ "srshl v24.4s, v24.4s, v19.4s\n"
+ "smax v25.4s, v25.4s, v31.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x23, x10]\n"
+ "smin v24.4s, v24.4s, v30.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "ldr q25, [SP, #0x30]\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v24.4s, v24.4s, v31.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "smin v23.4s, v23.4s, v30.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x22, x10]\n"
+ "smax v23.4s, v23.4s, v31.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "ldr q24, [SP, #0x40]\n"
+ "and v16.16b, v21.16b, v19.16b\n"
+ "add v28.4s, v28.4s, v29.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v22.4s, v22.4s, v30.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x21, x10]\n"
+ "smax v22.4s, v22.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr q23, [SP, #0x50]\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x20, x10]\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "ldr q22, [SP, #0x60]\n"
+ "srshl v21.4s, v21.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v30.4s\n"
+ "smax v21.4s, v21.4s, v31.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x19, x10]\n"
+ "add x10, x10, #0x4\n"
+ "ldr q21, [SP, #0x70]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ ".inst 0x4f8de11c // sdot v28.4s, v8.16b, v13.4b[0]\n"
+ "ldr q20, [%x[params], #0x60]\n"
+ "add x26, x26, x10\n"
+ ".inst 0x4f8de91b // sdot v27.4s, v8.16b, v13.4b[2]\n"
+ "ldr q19, [%x[params], #0x70]\n"
+ "add x25, x25, x10\n"
+ ".inst 0x4f8ce11a // sdot v26.4s, v8.16b, v12.4b[0]\n"
+ "add x24, x24, x10\n"
+ ".inst 0x4f8ce919 // sdot v25.4s, v8.16b, v12.4b[2]\n"
+ "add x23, x23, x10\n"
+ ".inst 0x4f87e118 // sdot v24.4s, v8.16b, v7.4b[0]\n"
+ "add x22, x22, x10\n"
+ ".inst 0x4f87e917 // sdot v23.4s, v8.16b, v7.4b[2]\n"
+ "add x21, x21, x10\n"
+ ".inst 0x4f86e116 // sdot v22.4s, v8.16b, v6.4b[0]\n"
+ "add x20, x20, x10\n"
+ ".inst 0x4f86e915 // sdot v21.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "add x19, x19, x10\n"
+ ".inst 0x4fade13c // sdot v28.4s, v9.16b, v13.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x4fade93b // sdot v27.4s, v9.16b, v13.4b[3]\n"
+ ".inst 0x4face13a // sdot v26.4s, v9.16b, v12.4b[1]\n"
+ ".inst 0x4face939 // sdot v25.4s, v9.16b, v12.4b[3]\n"
+ ".inst 0x4fa7e138 // sdot v24.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e937 // sdot v23.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e136 // sdot v22.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e935 // sdot v21.4s, v9.16b, v6.4b[3]\n"
+ "ldr q9, [%x[params], #0x10]\n"
+ ".inst 0x4f8ce15c // sdot v28.4s, v10.16b, v12.4b[0]\n"
+ ".inst 0x4f8ce95b // sdot v27.4s, v10.16b, v12.4b[2]\n"
+ ".inst 0x4f87e15a // sdot v26.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x4f87e959 // sdot v25.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x4f86e158 // sdot v24.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x4f86e957 // sdot v23.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x4f85e955 // sdot v21.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%x[params], #0x20]\n"
+ ".inst 0x4face17c // sdot v28.4s, v11.16b, v12.4b[1]\n"
+ ".inst 0x4face97b // sdot v27.4s, v11.16b, v12.4b[3]\n"
+ ".inst 0x4fa7e17a // sdot v26.4s, v11.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e979 // sdot v25.4s, v11.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e178 // sdot v24.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e977 // sdot v23.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4fa5e176 // sdot v22.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e975 // sdot v21.4s, v11.16b, v5.4b[3]\n"
+ "ldr q11, [%x[params], #0x30]\n"
+ ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x4f87e91b // sdot v27.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x4f86e11a // sdot v26.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x4f86e919 // sdot v25.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x4f85e118 // sdot v24.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e917 // sdot v23.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x4f84e116 // sdot v22.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e915 // sdot v21.4s, v8.16b, v4.4b[2]\n"
+ "ldr q8, [%x[params], #0x40]\n"
+ ".inst 0x4fa7e13c // sdot v28.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x4fa7e93b // sdot v27.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e939 // sdot v25.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x4fa5e138 // sdot v24.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e937 // sdot v23.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e136 // sdot v22.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e935 // sdot v21.4s, v9.16b, v4.4b[3]\n"
+ "ldr q9, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x80\n"
+ ".inst 0x4f86e15c // sdot v28.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x4f86e95b // sdot v27.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x4f85e15a // sdot v26.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x4f85e959 // sdot v25.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x4f84e158 // sdot v24.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x4f84e957 // sdot v23.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x4f82e156 // sdot v22.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e955 // sdot v21.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa6e17c // sdot v28.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x4fa6e97b // sdot v27.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x4fa5e17a // sdot v26.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e979 // sdot v25.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e178 // sdot v24.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e977 // sdot v23.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x4fa2e176 // sdot v22.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e975 // sdot v21.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f85e11c // sdot v28.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x4f84e11a // sdot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e917 // sdot v23.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4fa5e13c // sdot v28.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x4fa4e13a // sdot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e937 // sdot v23.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa1e136 // sdot v22.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e935 // sdot v21.4s, v9.16b, v1.4b[3]\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "and v18.16b, v28.16b, v19.16b\n"
+ "and v17.16b, v27.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v19.16b\n"
+ "srshl v28.4s, v28.4s, v19.4s\n"
+ "srshl v27.4s, v27.4s, v19.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v30.4s\n"
+ "smin v27.4s, v27.4s, v30.4s\n"
+ "smin v26.4s, v26.4s, v30.4s\n"
+ "smax v28.4s, v28.4s, v31.4s\n"
+ "smax v27.4s, v27.4s, v31.4s\n"
+ "smax v26.4s, v26.4s, v31.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v16.16b, v24.16b, v19.16b\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "smax v25.4s, v25.4s, v31.4s\n"
+ "and v16.16b, v22.16b, v19.16b\n"
+ "srshl v24.4s, v24.4s, v19.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v31.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v23.4s, v23.4s, v31.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "srshl v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "and v16.16b, v21.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v22.4s, v22.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v19.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v30.4s\n"
+ "smax v21.4s, v21.4s, v31.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "blt 3f\n"
+ "str s28, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s26, [x24, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s22, [x20, #0x0]\n"
+ "str s21, [x19, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "st1 { v28.b }[0], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v25.b }[0], [x23], #0x1\n"
+ "st1 { v24.b }[0], [x22], #0x1\n"
+ "st1 { v23.b }[0], [x21], #0x1\n"
+ "st1 { v22.b }[0], [x20], #0x1\n"
+ "st1 { v21.b }[0], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v28.b }[1], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[1], [x25], #0x1\n"
+ "st1 { v26.b }[1], [x24], #0x1\n"
+ "st1 { v25.b }[1], [x23], #0x1\n"
+ "st1 { v24.b }[1], [x22], #0x1\n"
+ "st1 { v23.b }[1], [x21], #0x1\n"
+ "st1 { v22.b }[1], [x20], #0x1\n"
+ "st1 { v21.b }[1], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v28.b }[2], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v25.b }[2], [x23], #0x1\n"
+ "st1 { v24.b }[2], [x22], #0x1\n"
+ "st1 { v23.b }[2], [x21], #0x1\n"
+ "st1 { v22.b }[2], [x20], #0x1\n"
+ "st1 { v21.b }[2], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v28.b }[3], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[3], [x25], #0x1\n"
+ "st1 { v26.b }[3], [x24], #0x1\n"
+ "st1 { v25.b }[3], [x23], #0x1\n"
+ "st1 { v24.b }[3], [x22], #0x1\n"
+ "st1 { v23.b }[3], [x21], #0x1\n"
+ "st1 { v22.b }[3], [x20], #0x1\n"
+ "st1 { v21.b }[3], [x19], #0x1\n"
+ "4:" // Tail: End
+ "add SP, SP, #0x80\n"
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
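
All of the quantized kernels in this patch funnel their int32 accumulators through the same requantization idiom: a saturating rounding doubling high multiply (sqrdmulh) against the multiplier, an and/sshr/sqadd sign fix-up, a rounding right shift (srshl, with the shift value stored negated), the c_offset add, and a clamp to [minval, maxval] before narrowing with uzp1. A minimal scalar sketch of that per-lane arithmetic, assuming gemmlowp-style arm_gemm::Requantize32 semantics (sqrdmulh32 and requantize_lane are illustrative names, not part of this patch):

#include <algorithm>
#include <cstdint>

// Scalar model of SQRDMULH: saturating rounding doubling multiply, high half.
static int32_t sqrdmulh32(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // saturating case
    const int64_t prod = (int64_t)a * (int64_t)b;
    return (int32_t)((2 * prod + ((int64_t)1 << 31)) >> 32);
}

// Per-lane equivalent of the vector requantization sequences above.
static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh32(acc, mul);
    if (right_shift > 0)
    {
        // and/sshr/sqadd fix-up: bias negative values by -1 so the rounding
        // shift below rounds ties away from zero, as the vector code does.
        if (v < 0 && v != INT32_MIN) v -= 1;
        v = (int32_t)(((int64_t)v + ((int64_t)1 << (right_shift - 1))) >> right_shift);
    }
    v += c_offset;            // "add vX, vX, v0" in the kernels
    v = std::min(v, maxval);  // smin
    v = std::max(v, minval);  // smax
    return (int8_t)v;         // uzp1 narrowing
}
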
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..d0ae00d260
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 2; }
+
+ kern_type kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+ a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
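
The strategy struct above only pairs compile-time tile metadata (2 output rows, 8 output columns, 2 output column registers) with the kernel function pointer; a depthwise driver instantiates it and invokes kernel once per tile. A hypothetical sketch of such a call, assuming the header above and arm_gemm.hpp are already included (run_tile is an illustrative wrapper, not part of this patch):

#include <cstdint>

using Strategy = arm_conv::depthwise::
    a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst;

void run_tile(const int8_t *const *inptrs, int8_t *const *outptrs,
              const int8_t *weights, const int32_t *bias,
              unsigned int kernel_points, unsigned int n_output_channels,
              const int32_t *per_channel_left_shifts,
              const int32_t *per_channel_muls,
              const int32_t *per_channel_right_shifts,
              const arm_gemm::Requantize32 &qp, const CPUInfo *ci)
{
    Strategy strat(ci);
    // One call produces the whole 2x8 patch of output points for
    // n_output_channels channels, accumulating over kernel_points packed
    // weight/input pairs; the per-channel pointers may be null to fall
    // back on the per-layer quantization parameters in qp.
    strat.kernel(inptrs, outptrs, weights, bias, kernel_points,
                 n_output_channels, per_channel_left_shifts, per_channel_muls,
                 per_channel_right_shifts, qp);
}
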
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c0acd8805e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v13.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v12.16b }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v11.16b }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v10.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v9.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v8.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v7.4s }, [x19]\n"
+ "lsr x28, %x[n_output_channels], #0x2\n"
+ "cbz x28, 9f\n"
+ "1:" // Output channel loop
+ "movi v16.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x9, #0x2\n"
+ "ldr q16, [%x[bias], x19]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v6.16b, v16.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "mov v4.16b, v16.16b\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v16.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x19, x9, #0x2\n"
+ "ldr q8, [%x[rq_mul_ptr], x19]\n"
+ "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s17, [%x[weights]], #0x4\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "mov x19, %x[inptrs]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "ldr d3, [x25, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d2, [x27, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "cbz x20, 7f\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr d1, [x25, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d0, [x27, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldr d1, [x25, #0x0]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "ldr d0, [x27, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x28, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v16.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x19, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v16.s }[0], [x19]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v6.16b, v16.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "mov v4.16b, v16.16b\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v16.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v9.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v7.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x19], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x19], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v7.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s17, [%x[weights]], #0x4\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "mov x19, %x[inptrs]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "ldr d3, [x25, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d2, [x27, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "cbz x20, 22f\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr d1, [x25, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d0, [x27, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldr d1, [x25, #0x0]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "ldr d0, [x27, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.h }[0], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.h }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.h }[0], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.h }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.h }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.h }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.h }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.h }[0], [x25]\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v19.h }[0], [x26]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.b }[2], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.b }[2], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.b }[2], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.b }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.b }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.b }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.b }[2], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.b }[2], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.b }[2], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.b }[2], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.b }[2], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.b }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.b }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v19.b }[2], [x26]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.b }[0], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.b }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.b }[0], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.b }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.b }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.b }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.b }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.b }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.b }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.b }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.b }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.b }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.b }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.b }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.b }[0], [x25]\n"
+ "st1 { v19.b }[0], [x26]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+
+ "26:" // Done
+
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..0fde00ba37
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
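+ // Annotation: for stride 1 the input tile is output + kernel - 1 per
+ // dimension, hence 2 + 3 - 1 = 4 below.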
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
+
+ kern_type kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+ a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..bdbda178b3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
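+ // Annotation (reader's note, not generated output): this kernel appears to
+ // walk 16 channels per main-loop iteration (x28 = n_channels >> 4), reading
+ // the 4x4 input patch through the sixteen pointers in inptrs and writing the
+ // 2x2 output tile through the four pointers in outptrs; minval, maxval and
+ // c_offset are broadcast from qp (v12, v11, v10) before the loop.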
+ __asm__ __volatile__(
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "lsr x28, %x[n_channels], #0x4\n"
+ "ldp x27, x26, [%x[inptrs], #0x30]\n"
+ "add x25, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ldp x24, x23, [%x[outptrs], #0x0]\n"
+ "add x22, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ldp x21, x20, [%x[outptrs], #0x10]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.4s }, [x25]\n"
+ "ld1r { v11.4s }, [x22]\n"
+ "ld1r { v10.4s }, [x19]\n"
+ "cbz x28, 2f\n"
+ "1:" // Loop
+ "ldr q27, [x15, x11]\n"
+ "subs x28, x28, #0x1\n"
+ "ldr q1, [x14, x11]\n"
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldr q25, [x13, x11]\n"
+ "zip1 v6.16b, v27.16b, v25.16b\n"
+ "ldr q23, [x12, x11]\n"
+ "zip2 v9.16b, v27.16b, v25.16b\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "ldr q31, [x10, x11]\n"
+ "zip1 v5.16b, v1.16b, v23.16b\n"
+ "ldr q28, [x9, x11]\n"
+ "zip2 v3.16b, v1.16b, v23.16b\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "zip1 v8.16b, v6.16b, v5.16b\n"
+ "ldr q21, [x27, x11]\n"
+ "zip2 v7.16b, v6.16b, v5.16b\n"
+ "ldr q26, [x26, x11]\n"
+ "zip1 v6.16b, v9.16b, v3.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x70]\n"
+ "zip2 v5.16b, v9.16b, v3.16b\n"
+ "ldr q24, [x15, x11]\n"
+ "ldr q22, [x14, x11]\n"
+ "zip1 v2.16b, v31.16b, v21.16b\n"
+ "zip2 v4.16b, v31.16b, v21.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "zip1 v1.16b, v28.16b, v26.16b\n"
+ "ldr q20, [x13, x11]\n"
+ "zip2 v31.16b, v28.16b, v26.16b\n"
+ "ldr q16, [x12, x11]\n"
+ "zip1 v3.16b, v2.16b, v1.16b\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "zip2 v2.16b, v2.16b, v1.16b\n"
+ "ldr q19, [x10, x11]\n"
+ "zip1 v1.16b, v4.16b, v31.16b\n"
+ "ldr q0, [x9, x11]\n"
+ "zip1 v28.16b, v24.16b, v20.16b\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "zip2 v26.16b, v24.16b, v20.16b\n"
+ "ldr q18, [x27, x11]\n"
+ "zip1 v24.16b, v22.16b, v16.16b\n"
+ "ldr q17, [x26, x11]\n"
+ "zip2 v22.16b, v22.16b, v16.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x30]\n"
+ "zip2 v16.16b, v4.16b, v31.16b\n"
+ "str q6, [SP, #0x0]\n"
+ "zip1 v31.16b, v28.16b, v24.16b\n"
+ "str q5, [SP, #0x10]\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "str q1, [SP, #0x20]\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "str q16, [SP, #0x30]\n"
+ "zip1 v18.16b, v0.16b, v17.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "zip2 v28.16b, v28.16b, v24.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "zip1 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x40]\n"
+ "zip2 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x50]\n"
+ "zip1 v26.16b, v20.16b, v18.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip2 v24.16b, v20.16b, v18.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x60]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x70]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v20.16b, v30.16b\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ "ldr q8, [SP, #0x0]\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ "ldr q29, [%x[params], #0x70]\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ "ldr q3, [SP, #0x20]\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ "ldr q27, [%x[params], #0x80]\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ "ldr q31, [SP, #0x40]\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "ldr q25, [%x[params], #0x90]\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "ldr q26, [SP, #0x60]\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xa0]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0xb0]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x24, x11]\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q30, [%x[params], #0x60]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x21, x11]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x20, x11]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x11, x11, #0x4\n"
+ ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
+ "ldr q7, [SP, #0x10]\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0xd0]\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ "ldr q2, [SP, #0x30]\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ "ldr q27, [%x[params], #0xe0]\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ "ldr q28, [SP, #0x50]\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "ldr q25, [%x[params], #0xf0]\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "ldr q24, [SP, #0x70]\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0x110]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x24, x11]\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q30, [%x[params], #0xc0]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x21, x11]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x20, x11]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x11, x11, #0x4\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ "ldr q29, [%x[params], #0x130]\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ "ldr q27, [%x[params], #0x140]\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "ldr q25, [%x[params], #0x150]\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0x170]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x24, x11]\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x21, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "mov v20.16b, v30.16b\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x20, x11]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x11, x11, #0x4\n"
+ ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x24, x11]\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x21, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0xf\n"
+ "beq 34f\n"
+ "2:" // Oddments
+ "and x19, %x[n_channels], #0xf\n"
+ "add x15, x15, x11\n"
+ "add x14, x14, x11\n"
+ "add x13, x13, x11\n"
+ "add x12, x12, x11\n"
+ "add x10, x10, x11\n"
+ "add x9, x9, x11\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "tbz %x[n_channels], #3, 6f\n"
+ "ld1 { v27.d }[0], [x15], #0x8\n"
+ "ld1 { v1.d }[0], [x14], #0x8\n"
+ "ld1 { v25.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "ld1 { v31.d }[0], [x10], #0x8\n"
+ "ld1 { v28.d }[0], [x9], #0x8\n"
+ "ld1 { v21.d }[0], [x27], #0x8\n"
+ "ld1 { v26.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #2, 4f\n"
+ "ld1 { v27.s }[2], [x15], #0x4\n"
+ "ld1 { v1.s }[2], [x14], #0x4\n"
+ "ld1 { v25.s }[2], [x13], #0x4\n"
+ "ld1 { v23.s }[2], [x12], #0x4\n"
+ "ld1 { v31.s }[2], [x10], #0x4\n"
+ "ld1 { v28.s }[2], [x9], #0x4\n"
+ "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 3f\n"
+ "ld1 { v27.h }[6], [x15], #0x2\n"
+ "ld1 { v1.h }[6], [x14], #0x2\n"
+ "ld1 { v25.h }[6], [x13], #0x2\n"
+ "ld1 { v23.h }[6], [x12], #0x2\n"
+ "ld1 { v31.h }[6], [x10], #0x2\n"
+ "ld1 { v28.h }[6], [x9], #0x2\n"
+ "ld1 { v21.h }[6], [x27], #0x2\n"
+ "ld1 { v26.h }[6], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[14], [x15], #0x1\n"
+ "ld1 { v1.b }[14], [x14], #0x1\n"
+ "ld1 { v25.b }[14], [x13], #0x1\n"
+ "ld1 { v23.b }[14], [x12], #0x1\n"
+ "ld1 { v31.b }[14], [x10], #0x1\n"
+ "ld1 { v28.b }[14], [x9], #0x1\n"
+ "ld1 { v21.b }[14], [x27], #0x1\n"
+ "ld1 { v26.b }[14], [x26], #0x1\n"
+ "b 10f\n"
+ "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[12], [x15], #0x1\n"
+ "ld1 { v1.b }[12], [x14], #0x1\n"
+ "ld1 { v25.b }[12], [x13], #0x1\n"
+ "ld1 { v23.b }[12], [x12], #0x1\n"
+ "ld1 { v31.b }[12], [x10], #0x1\n"
+ "ld1 { v28.b }[12], [x9], #0x1\n"
+ "ld1 { v21.b }[12], [x27], #0x1\n"
+ "ld1 { v26.b }[12], [x26], #0x1\n"
+ "b 10f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v27.h }[4], [x15], #0x2\n"
+ "ld1 { v1.h }[4], [x14], #0x2\n"
+ "ld1 { v25.h }[4], [x13], #0x2\n"
+ "ld1 { v23.h }[4], [x12], #0x2\n"
+ "ld1 { v31.h }[4], [x10], #0x2\n"
+ "ld1 { v28.h }[4], [x9], #0x2\n"
+ "ld1 { v21.h }[4], [x27], #0x2\n"
+ "ld1 { v26.h }[4], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[10], [x15], #0x1\n"
+ "ld1 { v1.b }[10], [x14], #0x1\n"
+ "ld1 { v25.b }[10], [x13], #0x1\n"
+ "ld1 { v23.b }[10], [x12], #0x1\n"
+ "ld1 { v31.b }[10], [x10], #0x1\n"
+ "ld1 { v28.b }[10], [x9], #0x1\n"
+ "ld1 { v21.b }[10], [x27], #0x1\n"
+ "ld1 { v26.b }[10], [x26], #0x1\n"
+ "b 10f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[8], [x15], #0x1\n"
+ "ld1 { v1.b }[8], [x14], #0x1\n"
+ "ld1 { v25.b }[8], [x13], #0x1\n"
+ "ld1 { v23.b }[8], [x12], #0x1\n"
+ "ld1 { v31.b }[8], [x10], #0x1\n"
+ "ld1 { v28.b }[8], [x9], #0x1\n"
+ "ld1 { v21.b }[8], [x27], #0x1\n"
+ "ld1 { v26.b }[8], [x26], #0x1\n"
+ "b 10f\n"
+ "6:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 8f\n"
+ "ld1 { v27.s }[0], [x15], #0x4\n"
+ "ld1 { v1.s }[0], [x14], #0x4\n"
+ "ld1 { v25.s }[0], [x13], #0x4\n"
+ "ld1 { v23.s }[0], [x12], #0x4\n"
+ "ld1 { v31.s }[0], [x10], #0x4\n"
+ "ld1 { v28.s }[0], [x9], #0x4\n"
+ "ld1 { v21.s }[0], [x27], #0x4\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v27.h }[2], [x15], #0x2\n"
+ "ld1 { v1.h }[2], [x14], #0x2\n"
+ "ld1 { v25.h }[2], [x13], #0x2\n"
+ "ld1 { v23.h }[2], [x12], #0x2\n"
+ "ld1 { v31.h }[2], [x10], #0x2\n"
+ "ld1 { v28.h }[2], [x9], #0x2\n"
+ "ld1 { v21.h }[2], [x27], #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[6], [x15], #0x1\n"
+ "ld1 { v1.b }[6], [x14], #0x1\n"
+ "ld1 { v25.b }[6], [x13], #0x1\n"
+ "ld1 { v23.b }[6], [x12], #0x1\n"
+ "ld1 { v31.b }[6], [x10], #0x1\n"
+ "ld1 { v28.b }[6], [x9], #0x1\n"
+ "ld1 { v21.b }[6], [x27], #0x1\n"
+ "ld1 { v26.b }[6], [x26], #0x1\n"
+ "b 10f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[4], [x15], #0x1\n"
+ "ld1 { v1.b }[4], [x14], #0x1\n"
+ "ld1 { v25.b }[4], [x13], #0x1\n"
+ "ld1 { v23.b }[4], [x12], #0x1\n"
+ "ld1 { v31.b }[4], [x10], #0x1\n"
+ "ld1 { v28.b }[4], [x9], #0x1\n"
+ "ld1 { v21.b }[4], [x27], #0x1\n"
+ "ld1 { v26.b }[4], [x26], #0x1\n"
+ "b 10f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v27.h }[0], [x15], #0x2\n"
+ "ld1 { v1.h }[0], [x14], #0x2\n"
+ "ld1 { v25.h }[0], [x13], #0x2\n"
+ "ld1 { v23.h }[0], [x12], #0x2\n"
+ "ld1 { v31.h }[0], [x10], #0x2\n"
+ "ld1 { v28.h }[0], [x9], #0x2\n"
+ "ld1 { v21.h }[0], [x27], #0x2\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[2], [x15], #0x1\n"
+ "ld1 { v1.b }[2], [x14], #0x1\n"
+ "ld1 { v25.b }[2], [x13], #0x1\n"
+ "ld1 { v23.b }[2], [x12], #0x1\n"
+ "ld1 { v31.b }[2], [x10], #0x1\n"
+ "ld1 { v28.b }[2], [x9], #0x1\n"
+ "ld1 { v21.b }[2], [x27], #0x1\n"
+ "ld1 { v26.b }[2], [x26], #0x1\n"
+ "b 10f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[0], [x15], #0x1\n"
+ "ld1 { v1.b }[0], [x14], #0x1\n"
+ "ld1 { v25.b }[0], [x13], #0x1\n"
+ "ld1 { v23.b }[0], [x12], #0x1\n"
+ "ld1 { v31.b }[0], [x10], #0x1\n"
+ "ld1 { v28.b }[0], [x9], #0x1\n"
+ "ld1 { v21.b }[0], [x27], #0x1\n"
+ "ld1 { v26.b }[0], [x26], #0x1\n"
+ "10:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "add x15, x15, x11\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "add x14, x14, x11\n"
+ "ldp x27, x26, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x11\n"
+ "add x12, x12, x11\n"
+ "add x10, x10, x11\n"
+ "add x9, x9, x11\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "tbz %x[n_channels], #3, 14f\n"
+ "ld1 { v24.d }[0], [x15], #0x8\n"
+ "ld1 { v22.d }[0], [x14], #0x8\n"
+ "ld1 { v20.d }[0], [x13], #0x8\n"
+ "ld1 { v16.d }[0], [x12], #0x8\n"
+ "ld1 { v19.d }[0], [x10], #0x8\n"
+ "ld1 { v0.d }[0], [x9], #0x8\n"
+ "ld1 { v18.d }[0], [x27], #0x8\n"
+ "ld1 { v17.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ld1 { v24.s }[2], [x15], #0x4\n"
+ "ld1 { v22.s }[2], [x14], #0x4\n"
+ "ld1 { v20.s }[2], [x13], #0x4\n"
+ "ld1 { v16.s }[2], [x12], #0x4\n"
+ "ld1 { v19.s }[2], [x10], #0x4\n"
+ "ld1 { v0.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v24.h }[6], [x15], #0x2\n"
+ "ld1 { v22.h }[6], [x14], #0x2\n"
+ "ld1 { v20.h }[6], [x13], #0x2\n"
+ "ld1 { v16.h }[6], [x12], #0x2\n"
+ "ld1 { v19.h }[6], [x10], #0x2\n"
+ "ld1 { v0.h }[6], [x9], #0x2\n"
+ "ld1 { v18.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[14], [x15], #0x1\n"
+ "ld1 { v22.b }[14], [x14], #0x1\n"
+ "ld1 { v20.b }[14], [x13], #0x1\n"
+ "ld1 { v16.b }[14], [x12], #0x1\n"
+ "ld1 { v19.b }[14], [x10], #0x1\n"
+ "ld1 { v0.b }[14], [x9], #0x1\n"
+ "ld1 { v18.b }[14], [x27], #0x1\n"
+ "ld1 { v17.b }[14], [x26], #0x1\n"
+ "b 18f\n"
+ "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[12], [x15], #0x1\n"
+ "ld1 { v22.b }[12], [x14], #0x1\n"
+ "ld1 { v20.b }[12], [x13], #0x1\n"
+ "ld1 { v16.b }[12], [x12], #0x1\n"
+ "ld1 { v19.b }[12], [x10], #0x1\n"
+ "ld1 { v0.b }[12], [x9], #0x1\n"
+ "ld1 { v18.b }[12], [x27], #0x1\n"
+ "ld1 { v17.b }[12], [x26], #0x1\n"
+ "b 18f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v24.h }[4], [x15], #0x2\n"
+ "ld1 { v22.h }[4], [x14], #0x2\n"
+ "ld1 { v20.h }[4], [x13], #0x2\n"
+ "ld1 { v16.h }[4], [x12], #0x2\n"
+ "ld1 { v19.h }[4], [x10], #0x2\n"
+ "ld1 { v0.h }[4], [x9], #0x2\n"
+ "ld1 { v18.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[10], [x15], #0x1\n"
+ "ld1 { v22.b }[10], [x14], #0x1\n"
+ "ld1 { v20.b }[10], [x13], #0x1\n"
+ "ld1 { v16.b }[10], [x12], #0x1\n"
+ "ld1 { v19.b }[10], [x10], #0x1\n"
+ "ld1 { v0.b }[10], [x9], #0x1\n"
+ "ld1 { v18.b }[10], [x27], #0x1\n"
+ "ld1 { v17.b }[10], [x26], #0x1\n"
+ "b 18f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[8], [x15], #0x1\n"
+ "ld1 { v22.b }[8], [x14], #0x1\n"
+ "ld1 { v20.b }[8], [x13], #0x1\n"
+ "ld1 { v16.b }[8], [x12], #0x1\n"
+ "ld1 { v19.b }[8], [x10], #0x1\n"
+ "ld1 { v0.b }[8], [x9], #0x1\n"
+ "ld1 { v18.b }[8], [x27], #0x1\n"
+ "ld1 { v17.b }[8], [x26], #0x1\n"
+ "b 18f\n"
+ "14:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 16f\n"
+ "ld1 { v24.s }[0], [x15], #0x4\n"
+ "ld1 { v22.s }[0], [x14], #0x4\n"
+ "ld1 { v20.s }[0], [x13], #0x4\n"
+ "ld1 { v16.s }[0], [x12], #0x4\n"
+ "ld1 { v19.s }[0], [x10], #0x4\n"
+ "ld1 { v0.s }[0], [x9], #0x4\n"
+ "ld1 { v18.s }[0], [x27], #0x4\n"
+ "ld1 { v17.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ld1 { v24.h }[2], [x15], #0x2\n"
+ "ld1 { v22.h }[2], [x14], #0x2\n"
+ "ld1 { v20.h }[2], [x13], #0x2\n"
+ "ld1 { v16.h }[2], [x12], #0x2\n"
+ "ld1 { v19.h }[2], [x10], #0x2\n"
+ "ld1 { v0.h }[2], [x9], #0x2\n"
+ "ld1 { v18.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[6], [x15], #0x1\n"
+ "ld1 { v22.b }[6], [x14], #0x1\n"
+ "ld1 { v20.b }[6], [x13], #0x1\n"
+ "ld1 { v16.b }[6], [x12], #0x1\n"
+ "ld1 { v19.b }[6], [x10], #0x1\n"
+ "ld1 { v0.b }[6], [x9], #0x1\n"
+ "ld1 { v18.b }[6], [x27], #0x1\n"
+ "ld1 { v17.b }[6], [x26], #0x1\n"
+ "b 18f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[4], [x15], #0x1\n"
+ "ld1 { v22.b }[4], [x14], #0x1\n"
+ "ld1 { v20.b }[4], [x13], #0x1\n"
+ "ld1 { v16.b }[4], [x12], #0x1\n"
+ "ld1 { v19.b }[4], [x10], #0x1\n"
+ "ld1 { v0.b }[4], [x9], #0x1\n"
+ "ld1 { v18.b }[4], [x27], #0x1\n"
+ "ld1 { v17.b }[4], [x26], #0x1\n"
+ "b 18f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v24.h }[0], [x15], #0x2\n"
+ "ld1 { v22.h }[0], [x14], #0x2\n"
+ "ld1 { v20.h }[0], [x13], #0x2\n"
+ "ld1 { v16.h }[0], [x12], #0x2\n"
+ "ld1 { v19.h }[0], [x10], #0x2\n"
+ "ld1 { v0.h }[0], [x9], #0x2\n"
+ "ld1 { v18.h }[0], [x27], #0x2\n"
+ "ld1 { v17.h }[0], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[2], [x15], #0x1\n"
+ "ld1 { v22.b }[2], [x14], #0x1\n"
+ "ld1 { v20.b }[2], [x13], #0x1\n"
+ "ld1 { v16.b }[2], [x12], #0x1\n"
+ "ld1 { v19.b }[2], [x10], #0x1\n"
+ "ld1 { v0.b }[2], [x9], #0x1\n"
+ "ld1 { v18.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "b 18f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[0], [x15], #0x1\n"
+ "ld1 { v22.b }[0], [x14], #0x1\n"
+ "ld1 { v20.b }[0], [x13], #0x1\n"
+ "ld1 { v16.b }[0], [x12], #0x1\n"
+ "ld1 { v19.b }[0], [x10], #0x1\n"
+ "ld1 { v0.b }[0], [x9], #0x1\n"
+ "ld1 { v18.b }[0], [x27], #0x1\n"
+ "ld1 { v17.b }[0], [x26], #0x1\n"
+ "18:" // Oddments: Load (B): Bit 3: End
+ "zip1 v6.16b, v27.16b, v25.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "cmp x19, #0x4\n"
+ "zip2 v9.16b, v27.16b, v25.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "zip1 v5.16b, v1.16b, v23.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "zip2 v3.16b, v1.16b, v23.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip1 v2.16b, v31.16b, v21.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "zip2 v4.16b, v31.16b, v21.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "zip1 v1.16b, v28.16b, v26.16b\n"
+ "zip2 v31.16b, v28.16b, v26.16b\n"
+ "zip1 v28.16b, v24.16b, v20.16b\n"
+ "zip2 v26.16b, v24.16b, v20.16b\n"
+ "zip1 v24.16b, v22.16b, v16.16b\n"
+ "zip2 v22.16b, v22.16b, v16.16b\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "zip1 v18.16b, v0.16b, v17.16b\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "zip1 v8.16b, v6.16b, v5.16b\n"
+ "zip2 v7.16b, v6.16b, v5.16b\n"
+ "zip1 v6.16b, v9.16b, v3.16b\n"
+ "str q6, [SP, #0x0]\n"
+ "zip2 v5.16b, v9.16b, v3.16b\n"
+ "str q5, [SP, #0x10]\n"
+ "zip1 v3.16b, v2.16b, v1.16b\n"
+ "zip2 v2.16b, v2.16b, v1.16b\n"
+ "zip1 v1.16b, v4.16b, v31.16b\n"
+ "str q1, [SP, #0x20]\n"
+ "zip2 v16.16b, v4.16b, v31.16b\n"
+ "str q16, [SP, #0x30]\n"
+ "zip1 v31.16b, v28.16b, v24.16b\n"
+ "zip2 v28.16b, v28.16b, v24.16b\n"
+ "zip1 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x40]\n"
+ "zip2 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x50]\n"
+ "zip1 v26.16b, v20.16b, v18.16b\n"
+ "zip2 v24.16b, v20.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x60]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x70]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 19f\n"
+ "str s30, [x24, x11]\n"
+ "str s22, [x23, x11]\n"
+ "str s20, [x21, x11]\n"
+ "str s19, [x20, x11]\n"
+ "b 22f\n"
+ "19:" // Oddments: Unroll 0: Oddment store
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "tbz x19, #1, 20f\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v22.h }[0], [x23], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x19, #0, 21f\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v22.b }[2], [x23], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v19.b }[2], [x20], #0x1\n"
+ "b 21f\n"
+ "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 21f\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v22.b }[0], [x23], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v19.b }[0], [x20], #0x1\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+
+ "22:" // Oddments: Unroll 0: After oddment store
+ "add x11, x11, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "cmp x19, #0x4\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 23f\n"
+ "str s30, [x24, x11]\n"
+ "str s22, [x23, x11]\n"
+ "str s20, [x21, x11]\n"
+ "str s19, [x20, x11]\n"
+ "b 26f\n"
+ "23:" // Oddments: Unroll 1: Oddment store
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "tbz x19, #1, 24f\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v22.h }[0], [x23], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x19, #0, 25f\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v22.b }[2], [x23], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v19.b }[2], [x20], #0x1\n"
+ "b 25f\n"
+ "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 25f\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v22.b }[0], [x23], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v19.b }[0], [x20], #0x1\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+
+ "26:" // Oddments: Unroll 1: After oddment store
+ "add x11, x11, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "ldr q8, [SP, #0x0]\n"
+ "ldr q3, [SP, #0x20]\n"
+ "cmp x19, #0x4\n"
+ "ldr q31, [SP, #0x40]\n"
+ "ldr q26, [SP, #0x60]\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n"
+ ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n"
+ ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 27f\n"
+ "str s30, [x24, x11]\n"
+ "str s22, [x23, x11]\n"
+ "str s20, [x21, x11]\n"
+ "str s19, [x20, x11]\n"
+ "b 30f\n"
+ "27:" // Oddments: Unroll 2: Oddment store
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "tbz x19, #1, 28f\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v22.h }[0], [x23], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x19, #0, 29f\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v22.b }[2], [x23], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v19.b }[2], [x20], #0x1\n"
+ "b 29f\n"
+ "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 29f\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v22.b }[0], [x23], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v19.b }[0], [x20], #0x1\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+
+ "30:" // Oddments: Unroll 2: After oddment store
+ "add x11, x11, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "ldr q7, [SP, #0x10]\n"
+ "ldr q2, [SP, #0x30]\n"
+ "ldr q28, [SP, #0x50]\n"
+ "ldr q24, [SP, #0x70]\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n"
+ ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n"
+ "and v16.16b, v30.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "and v18.16b, v20.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "31:" // Oddments: Unroll 3: Oddment store
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "tbz x19, #1, 32f\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v22.h }[0], [x23], #0x2\n"
+ "st1 { v20.h }[0], [x21], #0x2\n"
+ "st1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x19, #0, 33f\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v22.b }[2], [x23], #0x1\n"
+ "st1 { v20.b }[2], [x21], #0x1\n"
+ "st1 { v19.b }[2], [x20], #0x1\n"
+ "b 33f\n"
+ "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 33f\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v22.b }[0], [x23], #0x1\n"
+ "st1 { v20.b }[0], [x21], #0x1\n"
+ "st1 { v19.b }[0], [x20], #0x1\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+
+ "34:" // End
+ "add SP, SP, #0x80\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
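
Every store path in the kernel above funnels its 32-bit accumulators through the same requantisation tail: sqrdmulh against the per-layer multiplier (v23), the and/sshr #31/sqadd fixup followed by srshl against the shift vector (v21), an add of the Requantize32 c_offset (v10), smax/smin clamping against minval/maxval (v12/v11), and uzp1 narrowing to bytes. As a reading aid, here is a scalar sketch of that arithmetic with hypothetical helper names; the fixup converts srshl's round-half-up into a round-half-away-from-zero divide by a power of two:

#include <algorithm>
#include <cstdint>

// Scalar model of SQRDMULH: saturating rounding doubling high multiply.
static int32_t sqrdmulh_scalar(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;   // the only saturating case
    const int64_t prod = static_cast<int64_t>(a) * b;
    return static_cast<int32_t>((prod + (1LL << 30)) >> 31);  // high half of 2*a*b, rounded
}

// Scalar model of the AND/SSHR #31/SQADD fixup plus SRSHL by a negative shift:
// a rounding right shift whose halves round away from zero.
static int32_t rounding_rshift(int32_t x, int shift)  // shift >= 0
{
    if (shift == 0) return x;
    const int64_t fixed = static_cast<int64_t>(x) + (x < 0 ? -1 : 0);      // fixup
    return static_cast<int32_t>((fixed + (1LL << (shift - 1))) >> shift);  // srshl
}

// One output lane, as produced by the tail above (clamp bounds come from qp).
static int8_t requantize_lane(int32_t acc, int32_t multiplier, int shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh_scalar(acc, multiplier);  // sqrdmulh vN.4s, vN.4s, v23.4s
    v = rounding_rshift(v, shift);                 // and/sshr/sqadd + srshl by v21
    v += c_offset;                                 // add vN.4s, vN.4s, v10.4s
    v = std::max(v, minval);                       // smax vN.4s, vN.4s, v12.4s
    v = std::min(v, maxval);                       // smin vN.4s, vN.4s, v11.4s
    return static_cast<int8_t>(v);                 // uzp1 narrowing before the store
}

The doubled uzp1 on each result register is the byte narrowing itself: two even-byte unzips gather the low byte of every 32-bit lane into the low word, which the following str s / st1 { .h } / st1 { .b } stores then write out.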
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..05eddd1853
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+ typedef uint32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_dot::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_dot::get_packed_size;
+
+ kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+ a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
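
The implementation that follows adds an explicit zero-point correction on top of the requantisation tail sketched earlier: v11 is built from the constant 0x00010101, so each udot against it adds up the three active input bytes of every four-byte group (the 0x00 byte suggesting the fourth tap of each group is padding), and the accumulated input sums are then removed with mls against the b_offset splat (v14) before requantisation, i.e. sum((w - b_off) * x) = sum(w * x) - b_off * sum(x). A scalar sketch of one such group, with a hypothetical helper name:

#include <cstdint>

// Scalar sketch of the offset correction performed with UDOT + MLS below.
static int32_t dot3_with_offset(const uint8_t x[4], const uint8_t w[4], int32_t b_off)
{
    int32_t acc = 0;      // accumulated by udot vN.4s, v29/v27/v25.16b, <inputs>
    int32_t row_sum = 0;  // accumulated by udot v15.4s, v11.16b, <inputs>
    for (int i = 0; i < 3; ++i)  // fourth byte skipped, matching the 0x00 byte of v11
    {
        acc += static_cast<int32_t>(w[i]) * x[i];
        row_sum += x[i];
    }
    return acc - row_sum * b_off;  // mls vN.4s, v15.4s, v14.4s
}

Computing the input sums with a second udot costs one extra dot product per accumulator but avoids widening the uint8_t inputs, which is presumably why the same movi/udot/mls pattern repeats in every unroll of the loop below.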
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..22c584f8e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+ __asm__ __volatile__(
+ "ldp x13, x12, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "ldp x11, x10, [%x[inptrs], #0x10]\n"
+ "mov x19, #0x1\n"
+ "ldp x9, x28, [%x[inptrs], #0x20]\n"
+ "orr x19, x19, #0x100\n"
+ "ldp x27, x26, [%x[inptrs], #0x30]\n"
+ "orr x19, x19, #0x10000\n"
+ "dup v11.4s, w19\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "mov x23, #0x0\n"
+ "ldp x22, x21, [%x[outptrs], #0x10]\n"
+ "lsr x20, %x[n_channels], #0x4\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v9.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.4s }, [x19]\n"
+ "cbz x20, 2f\n"
+ "1:" // Loop
+ "movi v15.4s, #0x0\n"
+ "ldr q27, [x13, x23]\n"
+ "subs x20, x20, #0x1\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q1, [x12, x23]\n"
+ "ldp x13, x12, [%x[inptrs], #0x40]\n"
+ "ldr q25, [x11, x23]\n"
+ "zip1 v7.16b, v27.16b, v25.16b\n"
+ "ldr q23, [x10, x23]\n"
+ "zip2 v5.16b, v27.16b, v25.16b\n"
+ "ldp x11, x10, [%x[inptrs], #0x50]\n"
+ "ldr q31, [x9, x23]\n"
+ "zip1 v8.16b, v1.16b, v23.16b\n"
+ "ldr q28, [x28, x23]\n"
+ "zip2 v3.16b, v1.16b, v23.16b\n"
+ "ldp x9, x28, [%x[inptrs], #0x60]\n"
+ "zip1 v6.16b, v7.16b, v8.16b\n"
+ "ldr q21, [x27, x23]\n"
+ "zip2 v8.16b, v7.16b, v8.16b\n"
+ "ldr q26, [x26, x23]\n"
+ "zip1 v7.16b, v5.16b, v3.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x70]\n"
+ "zip2 v5.16b, v5.16b, v3.16b\n"
+ "ldr q24, [x13, x23]\n"
+ "ldr q22, [x12, x23]\n"
+ "zip1 v2.16b, v31.16b, v21.16b\n"
+ "zip2 v4.16b, v31.16b, v21.16b\n"
+ "ldp x13, x12, [%x[inptrs], #0x0]\n"
+ "zip1 v1.16b, v28.16b, v26.16b\n"
+ "ldr q20, [x11, x23]\n"
+ "zip2 v31.16b, v28.16b, v26.16b\n"
+ "ldr q16, [x10, x23]\n"
+ "zip1 v3.16b, v2.16b, v1.16b\n"
+ "ldp x11, x10, [%x[inptrs], #0x10]\n"
+ "zip2 v2.16b, v2.16b, v1.16b\n"
+ "ldr q19, [x9, x23]\n"
+ "zip1 v1.16b, v4.16b, v31.16b\n"
+ "ldr q0, [x28, x23]\n"
+ "zip1 v28.16b, v24.16b, v20.16b\n"
+ "ldp x9, x28, [%x[inptrs], #0x20]\n"
+ "zip2 v26.16b, v24.16b, v20.16b\n"
+ "ldr q18, [x27, x23]\n"
+ "zip1 v24.16b, v22.16b, v16.16b\n"
+ "ldr q17, [x26, x23]\n"
+ "zip2 v22.16b, v22.16b, v16.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x30]\n"
+ "zip2 v16.16b, v4.16b, v31.16b\n"
+ "str q7, [SP, #0x0]\n"
+ "zip1 v31.16b, v28.16b, v24.16b\n"
+ "str q5, [SP, #0x10]\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "str q1, [SP, #0x20]\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "str q16, [SP, #0x30]\n"
+ "zip1 v18.16b, v0.16b, v17.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "zip2 v28.16b, v28.16b, v24.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "zip1 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x40]\n"
+ "zip2 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x50]\n"
+ "zip1 v26.16b, v20.16b, v18.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip2 v24.16b, v20.16b, v18.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x60]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x70]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v20.16b, v30.16b\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
+ ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
+ ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
+ ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
+ ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
+ "ldr q29, [%x[params], #0x70]\n"
+ ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
+ "ldr q3, [SP, #0x20]\n"
+ ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
+ "ldr q27, [%x[params], #0x80]\n"
+ ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
+ "ldr q31, [SP, #0x40]\n"
+ ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
+ "ldr q25, [%x[params], #0x90]\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
+ "ldr q6, [SP, #0x0]\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "ldr q26, [SP, #0x60]\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "movi v15.4s, #0x0\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xa0]\n"
+ ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "mov v17.16b, v15.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "ldr q30, [%x[params], #0x60]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0xb0]\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x22, x23]\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "movi v10.4s, #0x0\n"
+ ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0xd0]\n"
+ ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
+ "ldr q2, [SP, #0x30]\n"
+ ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
+ "ldr q27, [%x[params], #0xe0]\n"
+ ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
+ "ldr q28, [SP, #0x50]\n"
+ ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
+ "ldr q25, [%x[params], #0xf0]\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
+ "ldr q8, [SP, #0x10]\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "ldr q24, [SP, #0x70]\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "movi v15.4s, #0x0\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
+ "movi v10.4s, #0x0\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "mov v17.16b, v15.16b\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0x110]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "ldr q30, [%x[params], #0xc0]\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "str s20, [x22, x23]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
+ "ldr q29, [%x[params], #0x130]\n"
+ ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
+ ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
+ "ldr q27, [%x[params], #0x140]\n"
+ ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
+ ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
+ "ldr q25, [%x[params], #0x150]\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "movi v15.4s, #0x0\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "movi v10.4s, #0x0\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "mov v17.16b, v15.16b\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "ldr q21, [%x[params], #0x170]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x22, x23]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "mov v19.16b, v30.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
+ ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
+ ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x25, x23]\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x23]\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x22, x23]\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x21, x23]\n"
+ "add x23, x23, #0x4\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0xf\n"
+ "beq 34f\n"
+ "2:" // Oddments
+ "and x19, %x[n_channels], #0xf\n"
+ "add x13, x13, x23\n"
+ "add x12, x12, x23\n"
+ "add x11, x11, x23\n"
+ "add x10, x10, x23\n"
+ "add x9, x9, x23\n"
+ "add x28, x28, x23\n"
+ "add x27, x27, x23\n"
+ "add x26, x26, x23\n"
+ "tbz %x[n_channels], #3, 6f\n"
+ "ld1 { v27.d }[0], [x13], #0x8\n"
+ "ld1 { v1.d }[0], [x12], #0x8\n"
+ "ld1 { v25.d }[0], [x11], #0x8\n"
+ "ld1 { v23.d }[0], [x10], #0x8\n"
+ "ld1 { v31.d }[0], [x9], #0x8\n"
+ "ld1 { v28.d }[0], [x28], #0x8\n"
+ "ld1 { v21.d }[0], [x27], #0x8\n"
+ "ld1 { v26.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #2, 4f\n"
+ "ld1 { v27.s }[2], [x13], #0x4\n"
+ "ld1 { v1.s }[2], [x12], #0x4\n"
+ "ld1 { v25.s }[2], [x11], #0x4\n"
+ "ld1 { v23.s }[2], [x10], #0x4\n"
+ "ld1 { v31.s }[2], [x9], #0x4\n"
+ "ld1 { v28.s }[2], [x28], #0x4\n"
+ "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v26.s }[2], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 3f\n"
+ "ld1 { v27.h }[6], [x13], #0x2\n"
+ "ld1 { v1.h }[6], [x12], #0x2\n"
+ "ld1 { v25.h }[6], [x11], #0x2\n"
+ "ld1 { v23.h }[6], [x10], #0x2\n"
+ "ld1 { v31.h }[6], [x9], #0x2\n"
+ "ld1 { v28.h }[6], [x28], #0x2\n"
+ "ld1 { v21.h }[6], [x27], #0x2\n"
+ "ld1 { v26.h }[6], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[14], [x13], #0x1\n"
+ "ld1 { v1.b }[14], [x12], #0x1\n"
+ "ld1 { v25.b }[14], [x11], #0x1\n"
+ "ld1 { v23.b }[14], [x10], #0x1\n"
+ "ld1 { v31.b }[14], [x9], #0x1\n"
+ "ld1 { v28.b }[14], [x28], #0x1\n"
+ "ld1 { v21.b }[14], [x27], #0x1\n"
+ "ld1 { v26.b }[14], [x26], #0x1\n"
+ "b 10f\n"
+ "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[12], [x13], #0x1\n"
+ "ld1 { v1.b }[12], [x12], #0x1\n"
+ "ld1 { v25.b }[12], [x11], #0x1\n"
+ "ld1 { v23.b }[12], [x10], #0x1\n"
+ "ld1 { v31.b }[12], [x9], #0x1\n"
+ "ld1 { v28.b }[12], [x28], #0x1\n"
+ "ld1 { v21.b }[12], [x27], #0x1\n"
+ "ld1 { v26.b }[12], [x26], #0x1\n"
+ "b 10f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v27.h }[4], [x13], #0x2\n"
+ "ld1 { v1.h }[4], [x12], #0x2\n"
+ "ld1 { v25.h }[4], [x11], #0x2\n"
+ "ld1 { v23.h }[4], [x10], #0x2\n"
+ "ld1 { v31.h }[4], [x9], #0x2\n"
+ "ld1 { v28.h }[4], [x28], #0x2\n"
+ "ld1 { v21.h }[4], [x27], #0x2\n"
+ "ld1 { v26.h }[4], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[10], [x13], #0x1\n"
+ "ld1 { v1.b }[10], [x12], #0x1\n"
+ "ld1 { v25.b }[10], [x11], #0x1\n"
+ "ld1 { v23.b }[10], [x10], #0x1\n"
+ "ld1 { v31.b }[10], [x9], #0x1\n"
+ "ld1 { v28.b }[10], [x28], #0x1\n"
+ "ld1 { v21.b }[10], [x27], #0x1\n"
+ "ld1 { v26.b }[10], [x26], #0x1\n"
+ "b 10f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[8], [x13], #0x1\n"
+ "ld1 { v1.b }[8], [x12], #0x1\n"
+ "ld1 { v25.b }[8], [x11], #0x1\n"
+ "ld1 { v23.b }[8], [x10], #0x1\n"
+ "ld1 { v31.b }[8], [x9], #0x1\n"
+ "ld1 { v28.b }[8], [x28], #0x1\n"
+ "ld1 { v21.b }[8], [x27], #0x1\n"
+ "ld1 { v26.b }[8], [x26], #0x1\n"
+ "b 10f\n"
+ "6:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 8f\n"
+ "ld1 { v27.s }[0], [x13], #0x4\n"
+ "ld1 { v1.s }[0], [x12], #0x4\n"
+ "ld1 { v25.s }[0], [x11], #0x4\n"
+ "ld1 { v23.s }[0], [x10], #0x4\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v28.s }[0], [x28], #0x4\n"
+ "ld1 { v21.s }[0], [x27], #0x4\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v27.h }[2], [x13], #0x2\n"
+ "ld1 { v1.h }[2], [x12], #0x2\n"
+ "ld1 { v25.h }[2], [x11], #0x2\n"
+ "ld1 { v23.h }[2], [x10], #0x2\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v28.h }[2], [x28], #0x2\n"
+ "ld1 { v21.h }[2], [x27], #0x2\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[6], [x13], #0x1\n"
+ "ld1 { v1.b }[6], [x12], #0x1\n"
+ "ld1 { v25.b }[6], [x11], #0x1\n"
+ "ld1 { v23.b }[6], [x10], #0x1\n"
+ "ld1 { v31.b }[6], [x9], #0x1\n"
+ "ld1 { v28.b }[6], [x28], #0x1\n"
+ "ld1 { v21.b }[6], [x27], #0x1\n"
+ "ld1 { v26.b }[6], [x26], #0x1\n"
+ "b 10f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[4], [x13], #0x1\n"
+ "ld1 { v1.b }[4], [x12], #0x1\n"
+ "ld1 { v25.b }[4], [x11], #0x1\n"
+ "ld1 { v23.b }[4], [x10], #0x1\n"
+ "ld1 { v31.b }[4], [x9], #0x1\n"
+ "ld1 { v28.b }[4], [x28], #0x1\n"
+ "ld1 { v21.b }[4], [x27], #0x1\n"
+ "ld1 { v26.b }[4], [x26], #0x1\n"
+ "b 10f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v27.h }[0], [x13], #0x2\n"
+ "ld1 { v1.h }[0], [x12], #0x2\n"
+ "ld1 { v25.h }[0], [x11], #0x2\n"
+ "ld1 { v23.h }[0], [x10], #0x2\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v28.h }[0], [x28], #0x2\n"
+ "ld1 { v21.h }[0], [x27], #0x2\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[2], [x13], #0x1\n"
+ "ld1 { v1.b }[2], [x12], #0x1\n"
+ "ld1 { v25.b }[2], [x11], #0x1\n"
+ "ld1 { v23.b }[2], [x10], #0x1\n"
+ "ld1 { v31.b }[2], [x9], #0x1\n"
+ "ld1 { v28.b }[2], [x28], #0x1\n"
+ "ld1 { v21.b }[2], [x27], #0x1\n"
+ "ld1 { v26.b }[2], [x26], #0x1\n"
+ "b 10f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v27.b }[0], [x13], #0x1\n"
+ "ld1 { v1.b }[0], [x12], #0x1\n"
+ "ld1 { v25.b }[0], [x11], #0x1\n"
+ "ld1 { v23.b }[0], [x10], #0x1\n"
+ "ld1 { v31.b }[0], [x9], #0x1\n"
+ "ld1 { v28.b }[0], [x28], #0x1\n"
+ "ld1 { v21.b }[0], [x27], #0x1\n"
+ "ld1 { v26.b }[0], [x26], #0x1\n"
+ "10:" // Oddments: Load (A): Bit 3: End
+ "ldp x13, x12, [%x[inptrs], #0x40]\n"
+ "add x13, x13, x23\n"
+ "ldp x11, x10, [%x[inptrs], #0x50]\n"
+ "ldp x9, x28, [%x[inptrs], #0x60]\n"
+ "add x12, x12, x23\n"
+ "ldp x27, x26, [%x[inptrs], #0x70]\n"
+ "add x11, x11, x23\n"
+ "add x10, x10, x23\n"
+ "add x9, x9, x23\n"
+ "add x28, x28, x23\n"
+ "add x27, x27, x23\n"
+ "add x26, x26, x23\n"
+ "tbz %x[n_channels], #3, 14f\n"
+ "ld1 { v24.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "ld1 { v20.d }[0], [x11], #0x8\n"
+ "ld1 { v16.d }[0], [x10], #0x8\n"
+ "ld1 { v19.d }[0], [x9], #0x8\n"
+ "ld1 { v0.d }[0], [x28], #0x8\n"
+ "ld1 { v18.d }[0], [x27], #0x8\n"
+ "ld1 { v17.d }[0], [x26], #0x8\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ld1 { v24.s }[2], [x13], #0x4\n"
+ "ld1 { v22.s }[2], [x12], #0x4\n"
+ "ld1 { v20.s }[2], [x11], #0x4\n"
+ "ld1 { v16.s }[2], [x10], #0x4\n"
+ "ld1 { v19.s }[2], [x9], #0x4\n"
+ "ld1 { v0.s }[2], [x28], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v24.h }[6], [x13], #0x2\n"
+ "ld1 { v22.h }[6], [x12], #0x2\n"
+ "ld1 { v20.h }[6], [x11], #0x2\n"
+ "ld1 { v16.h }[6], [x10], #0x2\n"
+ "ld1 { v19.h }[6], [x9], #0x2\n"
+ "ld1 { v0.h }[6], [x28], #0x2\n"
+ "ld1 { v18.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[14], [x13], #0x1\n"
+ "ld1 { v22.b }[14], [x12], #0x1\n"
+ "ld1 { v20.b }[14], [x11], #0x1\n"
+ "ld1 { v16.b }[14], [x10], #0x1\n"
+ "ld1 { v19.b }[14], [x9], #0x1\n"
+ "ld1 { v0.b }[14], [x28], #0x1\n"
+ "ld1 { v18.b }[14], [x27], #0x1\n"
+ "ld1 { v17.b }[14], [x26], #0x1\n"
+ "b 18f\n"
+ "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[12], [x13], #0x1\n"
+ "ld1 { v22.b }[12], [x12], #0x1\n"
+ "ld1 { v20.b }[12], [x11], #0x1\n"
+ "ld1 { v16.b }[12], [x10], #0x1\n"
+ "ld1 { v19.b }[12], [x9], #0x1\n"
+ "ld1 { v0.b }[12], [x28], #0x1\n"
+ "ld1 { v18.b }[12], [x27], #0x1\n"
+ "ld1 { v17.b }[12], [x26], #0x1\n"
+ "b 18f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v24.h }[4], [x13], #0x2\n"
+ "ld1 { v22.h }[4], [x12], #0x2\n"
+ "ld1 { v20.h }[4], [x11], #0x2\n"
+ "ld1 { v16.h }[4], [x10], #0x2\n"
+ "ld1 { v19.h }[4], [x9], #0x2\n"
+ "ld1 { v0.h }[4], [x28], #0x2\n"
+ "ld1 { v18.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[10], [x13], #0x1\n"
+ "ld1 { v22.b }[10], [x12], #0x1\n"
+ "ld1 { v20.b }[10], [x11], #0x1\n"
+ "ld1 { v16.b }[10], [x10], #0x1\n"
+ "ld1 { v19.b }[10], [x9], #0x1\n"
+ "ld1 { v0.b }[10], [x28], #0x1\n"
+ "ld1 { v18.b }[10], [x27], #0x1\n"
+ "ld1 { v17.b }[10], [x26], #0x1\n"
+ "b 18f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[8], [x13], #0x1\n"
+ "ld1 { v22.b }[8], [x12], #0x1\n"
+ "ld1 { v20.b }[8], [x11], #0x1\n"
+ "ld1 { v16.b }[8], [x10], #0x1\n"
+ "ld1 { v19.b }[8], [x9], #0x1\n"
+ "ld1 { v0.b }[8], [x28], #0x1\n"
+ "ld1 { v18.b }[8], [x27], #0x1\n"
+ "ld1 { v17.b }[8], [x26], #0x1\n"
+ "b 18f\n"
+ "14:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 16f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "ld1 { v22.s }[0], [x12], #0x4\n"
+ "ld1 { v20.s }[0], [x11], #0x4\n"
+ "ld1 { v16.s }[0], [x10], #0x4\n"
+ "ld1 { v19.s }[0], [x9], #0x4\n"
+ "ld1 { v0.s }[0], [x28], #0x4\n"
+ "ld1 { v18.s }[0], [x27], #0x4\n"
+ "ld1 { v17.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "ld1 { v22.h }[2], [x12], #0x2\n"
+ "ld1 { v20.h }[2], [x11], #0x2\n"
+ "ld1 { v16.h }[2], [x10], #0x2\n"
+ "ld1 { v19.h }[2], [x9], #0x2\n"
+ "ld1 { v0.h }[2], [x28], #0x2\n"
+ "ld1 { v18.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[6], [x13], #0x1\n"
+ "ld1 { v22.b }[6], [x12], #0x1\n"
+ "ld1 { v20.b }[6], [x11], #0x1\n"
+ "ld1 { v16.b }[6], [x10], #0x1\n"
+ "ld1 { v19.b }[6], [x9], #0x1\n"
+ "ld1 { v0.b }[6], [x28], #0x1\n"
+ "ld1 { v18.b }[6], [x27], #0x1\n"
+ "ld1 { v17.b }[6], [x26], #0x1\n"
+ "b 18f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[4], [x13], #0x1\n"
+ "ld1 { v22.b }[4], [x12], #0x1\n"
+ "ld1 { v20.b }[4], [x11], #0x1\n"
+ "ld1 { v16.b }[4], [x10], #0x1\n"
+ "ld1 { v19.b }[4], [x9], #0x1\n"
+ "ld1 { v0.b }[4], [x28], #0x1\n"
+ "ld1 { v18.b }[4], [x27], #0x1\n"
+ "ld1 { v17.b }[4], [x26], #0x1\n"
+ "b 18f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "ld1 { v22.h }[0], [x12], #0x2\n"
+ "ld1 { v20.h }[0], [x11], #0x2\n"
+ "ld1 { v16.h }[0], [x10], #0x2\n"
+ "ld1 { v19.h }[0], [x9], #0x2\n"
+ "ld1 { v0.h }[0], [x28], #0x2\n"
+ "ld1 { v18.h }[0], [x27], #0x2\n"
+ "ld1 { v17.h }[0], [x26], #0x2\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[2], [x13], #0x1\n"
+ "ld1 { v22.b }[2], [x12], #0x1\n"
+ "ld1 { v20.b }[2], [x11], #0x1\n"
+ "ld1 { v16.b }[2], [x10], #0x1\n"
+ "ld1 { v19.b }[2], [x9], #0x1\n"
+ "ld1 { v0.b }[2], [x28], #0x1\n"
+ "ld1 { v18.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "b 18f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v24.b }[0], [x13], #0x1\n"
+ "ld1 { v22.b }[0], [x12], #0x1\n"
+ "ld1 { v20.b }[0], [x11], #0x1\n"
+ "ld1 { v16.b }[0], [x10], #0x1\n"
+ "ld1 { v19.b }[0], [x9], #0x1\n"
+ "ld1 { v0.b }[0], [x28], #0x1\n"
+ "ld1 { v18.b }[0], [x27], #0x1\n"
+ "ld1 { v17.b }[0], [x26], #0x1\n"
+ "18:" // Oddments: Load (B): Bit 3: End
+ "zip1 v7.16b, v27.16b, v25.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "cmp x19, #0x4\n"
+ "zip2 v5.16b, v27.16b, v25.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "zip1 v8.16b, v1.16b, v23.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "zip2 v3.16b, v1.16b, v23.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip1 v2.16b, v31.16b, v21.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "zip2 v4.16b, v31.16b, v21.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "zip1 v1.16b, v28.16b, v26.16b\n"
+ "zip2 v31.16b, v28.16b, v26.16b\n"
+ "zip1 v28.16b, v24.16b, v20.16b\n"
+ "zip2 v26.16b, v24.16b, v20.16b\n"
+ "zip1 v24.16b, v22.16b, v16.16b\n"
+ "zip2 v22.16b, v22.16b, v16.16b\n"
+ "zip1 v20.16b, v19.16b, v18.16b\n"
+ "zip2 v19.16b, v19.16b, v18.16b\n"
+ "zip1 v18.16b, v0.16b, v17.16b\n"
+ "zip2 v17.16b, v0.16b, v17.16b\n"
+ "zip1 v6.16b, v7.16b, v8.16b\n"
+ "zip2 v8.16b, v7.16b, v8.16b\n"
+ "zip1 v7.16b, v5.16b, v3.16b\n"
+ "str q7, [SP, #0x0]\n"
+ "zip2 v5.16b, v5.16b, v3.16b\n"
+ "str q5, [SP, #0x10]\n"
+ "zip1 v3.16b, v2.16b, v1.16b\n"
+ "zip2 v2.16b, v2.16b, v1.16b\n"
+ "zip1 v1.16b, v4.16b, v31.16b\n"
+ "str q1, [SP, #0x20]\n"
+ "zip2 v16.16b, v4.16b, v31.16b\n"
+ "str q16, [SP, #0x30]\n"
+ "zip1 v31.16b, v28.16b, v24.16b\n"
+ "zip2 v28.16b, v28.16b, v24.16b\n"
+ "zip1 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x40]\n"
+ "zip2 v16.16b, v26.16b, v22.16b\n"
+ "str q16, [SP, #0x50]\n"
+ "zip1 v26.16b, v20.16b, v18.16b\n"
+ "zip2 v24.16b, v20.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x60]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [SP, #0x70]\n"
+ "mov v22.16b, v30.16b\n"
+ "mov v20.16b, v30.16b\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
+ "movi v15.4s, #0x0\n"
+ ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
+ ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
+ ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
+ ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
+ "movi v10.4s, #0x0\n"
+ ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
+ ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
+ ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
+ ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
+ ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 19f\n"
+ "str s30, [x25, x23]\n"
+ "str s22, [x24, x23]\n"
+ "str s20, [x22, x23]\n"
+ "str s19, [x21, x23]\n"
+ "b 22f\n"
+ "19:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 20f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 21f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 21f\n"
+ "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 21f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+
+ "22:" // Oddments: Unroll 0: After oddment store
+ "add x23, x23, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "cmp x19, #0x4\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mov v19.16b, v30.16b\n"
+ ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
+ ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
+ ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
+ ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
+ ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
+ ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
+ ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
+ ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 23f\n"
+ "str s30, [x25, x23]\n"
+ "str s22, [x24, x23]\n"
+ "str s20, [x22, x23]\n"
+ "str s19, [x21, x23]\n"
+ "b 26f\n"
+ "23:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 24f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 25f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 25f\n"
+ "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 25f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+
+ "26:" // Oddments: Unroll 1: After oddment store
+ "add x23, x23, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q6, [SP, #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q3, [SP, #0x20]\n"
+ "cmp x19, #0x4\n"
+ ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n"
+ "ldr q31, [SP, #0x40]\n"
+ "ldr q26, [SP, #0x60]\n"
+ ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n"
+ ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n"
+ ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n"
+ ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n"
+ ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n"
+ ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n"
+ ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "blt 27f\n"
+ "str s30, [x25, x23]\n"
+ "str s22, [x24, x23]\n"
+ "str s20, [x22, x23]\n"
+ "str s19, [x21, x23]\n"
+ "b 30f\n"
+ "27:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 28f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 29f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 29f\n"
+ "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 29f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+
+ "30:" // Oddments: Unroll 2: After oddment store
+ "add x23, x23, #0x4\n"
+ "subs x19, x19, #0x4\n"
+ "ble 34f\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q8, [SP, #0x10]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q2, [SP, #0x30]\n"
+ "ldr q28, [SP, #0x50]\n"
+ ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n"
+ "ldr q24, [SP, #0x70]\n"
+ "ldr q30, [%x[params], #0x0]\n"
+ "mov v22.16b, v30.16b\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "mov v20.16b, v30.16b\n"
+ "ldr q27, [%x[params], #0x20]\n"
+ "mov v19.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n"
+ "ldr q23, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n"
+ "mov v17.16b, v15.16b\n"
+ ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n"
+ ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n"
+ "mls v30.4s, v15.4s, v14.4s\n"
+ ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ "mls v20.4s, v17.4s, v14.4s\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n"
+ ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n"
+ ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n"
+ ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n"
+ ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n"
+ "and v18.16b, v30.16b, v21.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n"
+ ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n"
+ "mov v17.16b, v10.16b\n"
+ ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n"
+ "mls v22.4s, v10.4s, v14.4s\n"
+ ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "mls v19.4s, v17.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "and v16.16b, v20.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v30.4s, v30.4s, v9.4s\n"
+ "and v17.16b, v22.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "and v16.16b, v19.16b, v21.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "srshl v22.4s, v22.4s, v21.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v9.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smax v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "31:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x23\n"
+ "add x24, x24, x23\n"
+ "add x22, x22, x23\n"
+ "add x21, x21, x23\n"
+ "tbz x19, #1, 32f\n"
+ "st1 { v30.h }[0], [x25], #0x2\n"
+ "st1 { v22.h }[0], [x24], #0x2\n"
+ "st1 { v20.h }[0], [x22], #0x2\n"
+ "st1 { v19.h }[0], [x21], #0x2\n"
+ "tbz x19, #0, 33f\n"
+ "st1 { v30.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v20.b }[2], [x22], #0x1\n"
+ "st1 { v19.b }[2], [x21], #0x1\n"
+ "b 33f\n"
+ "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "tbz x19, #0, 33f\n"
+ "st1 { v30.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v20.b }[0], [x22], #0x1\n"
+ "st1 { v19.b }[0], [x21], #0x1\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+
+ "34:" // End
+ "add SP, SP, #0x80\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
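
Reviewer note: the store path above repeats one fixed-point requantize pattern per accumulator: sqrdmulh by the per-layer multiplier, a sign-correction triplet (and / sshr #31 / sqadd), a rounding right shift (srshl by a negative amount), addition of the output offset, then a clamp before uzp1 narrows to bytes. A minimal scalar sketch of that sequence, assuming the usual Requantize32 semantics; the helper names are illustrative, not from this patch:

    #include <algorithm>
    #include <cstdint>

    // Scalar analogue of SQRDMULH: rounding-doubling high half of a 32x32
    // multiply (the saturating corner case a == b == INT32_MIN is omitted).
    static inline int32_t rdmulh(int32_t a, int32_t b)
    {
        const int64_t prod = (int64_t) a * (int64_t) b;
        return (int32_t) ((prod + (1LL << 30)) >> 31);
    }

    // One lane of the requantize tail: multiply, nudge, rounding shift,
    // offset, clamp. 'shift' is the negative right-shift held in v21 above.
    static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                          int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rdmulh(acc, mul);
        if (shift < 0)
        {
            v += (v < 0) ? -1 : 0;                    // the and/sshr/sqadd fixup
            v = (v + (1 << (-shift - 1))) >> -shift;  // SRSHL by a negative amount
        }
        v += c_offset;                                // add v13 (c_offset) above
        return (uint8_t) std::min(maxval, std::max(minval, v));  // smin/smax clamp
    }

The minval/maxval and c_offset operands correspond to the Requantize32 fields loaded into vector registers at the top of the kernel.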
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..09ba75f685
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+ a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
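
Reviewer note: the input_rows/input_cols values in this descriptor follow directly from the tile geometry: a kernel of k rows at stride s producing n output rows reads (n - 1) * s + k input rows. A compile-time sketch of that relation (the helper name is illustrative, not part of the patch):

    // Footprint of a "valid" depthwise tile: output extent, kernel extent, stride.
    constexpr unsigned int tile_input_extent(unsigned int out, unsigned int k, unsigned int s)
    {
        return (out - 1) * s + k;
    }
    static_assert(tile_input_extent(2, 3, 1) == 4,
                  "2x2 output, 3x3 kernel, stride 1 -> 4x4 input, as declared above");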
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..14e113b776
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1192 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const uint8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x17, #0x0\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x15, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x12, x8, #0x3\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v14.16b }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v9.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v24.4s }, [x20]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "cbz x12, 3f\n"
+ "subs x12, x12, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q13, [x19, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q19, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v13.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "mov v20.16b, v19.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "ldr d4, [x16, #0x20]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ldr d7, [x16, #0x38]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "usubl v5.8h, v5.8b, v9.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "usubl v6.8h, v6.8b, v9.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "usubl v7.8h, v7.8b, v9.8b\n"
+ "usubl v8.8h, v8.8b, v9.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ldr d31, [x23, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "ldr d30, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "ldr d28, [x20, x17]\n"
+ "ldr d27, [x19, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v13.4s, v31.4h, v4.4h\n"
+ "ldr x21, [x14, #0x28]\n"
+ "add x16, x16, #0x48\n"
+ "smlal2 v19.4s, v31.8h, v4.8h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "subs x12, x12, #0x1\n"
+ "smlal v17.4s, v31.4h, v3.4h\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal2 v25.4s, v31.8h, v3.8h\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x19, x17]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal2 v19.4s, v28.8h, v5.8h\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "ldr q10, [x11, #0x0]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "ldr q11, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x17]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v21.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v19.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v27.4h, v6.4h\n"
+ "smlal2 v25.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x17]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "smlal v17.4s, v30.4h, v7.4h\n"
+ "smlal2 v25.4s, v30.8h, v7.8h\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v21.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x17]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
+ "smlal2 v19.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v25.4s, v28.8h, v5.8h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x19, x17]\n"
+ "add x17, x17, #0x8\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v19.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v8.4h\n"
+ "smlal2 v25.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v21.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+ "and v22.16b, v13.16b, v10.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v18.16b\n"
+ "and v3.16b, v17.16b, v10.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v6.16b, v25.16b, v18.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "and v0.16b, v16.16b, v10.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v29.16b, v21.16b, v18.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "smin v17.4s, v17.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "uzp1 v13.16b, v13.16b, v19.16b\n"
+ "sqadd v16.4s, v16.4s, v0.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x10, x15]\n"
+ "smax v25.4s, v25.4s, v24.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v3.16b, v23.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v25.16b\n"
+ "add v16.4s, v16.4s, v15.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x9, x15]\n"
+ "smin v16.4s, v16.4s, v12.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "and v25.16b, v20.16b, v18.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x28, x15]\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "srshl v20.4s, v20.4s, v18.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d23, [x27, x15]\n"
+ "add x15, x15, #0x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q13, [x19, #0x0]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q19, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v13.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "mov v20.16b, v19.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "ldr d4, [x16, #0x20]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ldr d7, [x16, #0x38]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "usubl v5.8h, v5.8b, v9.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "usubl v6.8h, v6.8b, v9.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "usubl v7.8h, v7.8b, v9.8b\n"
+ "usubl v8.8h, v8.8b, v9.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ldr d31, [x23, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "ldr d30, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "ldr d28, [x20, x17]\n"
+ "ldr d27, [x19, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v13.4s, v31.4h, v4.4h\n"
+ "ldr x21, [x14, #0x28]\n"
+ "tst x8, #0x7\n"
+ "smlal2 v19.4s, v31.8h, v4.8h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "smlal v17.4s, v31.4h, v3.4h\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal2 v25.4s, v31.8h, v3.8h\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x19, x17]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal2 v19.4s, v28.8h, v5.8h\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "ldr q10, [x11, #0x0]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "ldr q11, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "ldr q18, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x17]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v21.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v19.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v27.4h, v6.4h\n"
+ "smlal2 v25.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x17]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "smlal v17.4s, v30.4h, v7.4h\n"
+ "smlal2 v25.4s, v30.8h, v7.8h\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v21.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x17]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
+ "smlal2 v19.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v25.4s, v28.8h, v5.8h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x19, x17]\n"
+ "add x17, x17, #0x8\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v19.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v8.4h\n"
+ "smlal2 v25.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v21.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+ "and v22.16b, v13.16b, v10.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v18.16b\n"
+ "and v3.16b, v17.16b, v10.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v6.16b, v25.16b, v18.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "and v0.16b, v16.16b, v10.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v29.16b, v21.16b, v18.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "srshl v25.4s, v25.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "smin v17.4s, v17.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "uzp1 v13.16b, v13.16b, v19.16b\n"
+ "sqadd v16.4s, v16.4s, v0.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x10, x15]\n"
+ "smax v25.4s, v25.4s, v24.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v3.16b, v23.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v25.16b\n"
+ "add v16.4s, v16.4s, v15.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x9, x15]\n"
+ "smin v16.4s, v16.4s, v12.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "and v25.16b, v20.16b, v18.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x28, x15]\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "srshl v20.4s, v20.4s, v18.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d23, [x27, x15]\n"
+ "add x15, x15, #0x8\n"
+ "beq 64f\n"
+ "add x16, x16, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v13.4s }, [x19], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v19.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v19.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v19.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x8, #1, 6f\n"
+ "ld1 { v13.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v13.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v17.16b, v13.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v16.16b, v13.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "mov v21.16b, v19.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr d4, [x16, #0x20]\n"
+ "usubl v0.8h, v0.8b, v9.8b\n"
+ "mov v20.16b, v19.16b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "usubl v1.8h, v1.8b, v9.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr d7, [x16, #0x38]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "usubl v5.8h, v5.8b, v9.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "usubl v6.8h, v6.8b, v9.8b\n"
+ "usubl v7.8h, v7.8b, v9.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "usubl v8.8h, v8.8b, v9.8b\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "add x19, x19, x17\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x8, #1, 10f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ldr x21, [x14, #0x28]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v4.4h\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal2 v19.4s, v31.8h, v4.8h\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v17.4s, v31.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v25.4s, v31.8h, v3.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "add x21, x21, x17\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "smlal2 v19.4s, v30.8h, v0.8h\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "smlal2 v19.4s, v28.8h, v5.8h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v31.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[6], [x21]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[4], [x21]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x8, #1, 14f\n"
+ "ld1 { v31.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[2], [x21]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[0], [x21]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v19.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v27.4h, v6.4h\n"
+ "add x20, x20, x17\n"
+ "smlal2 v25.4s, v27.8h, v6.8h\n"
+ "smlal v23.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v21.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x8, #1, 18f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr x26, [x14, #0x38]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "add x26, x26, x17\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x8, #1, 22f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ldr x25, [x14, #0x40]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "add x25, x25, x17\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v31.s }[0], [x25], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v31.h }[2], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[6], [x25]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[4], [x25]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x8, #1, 26f\n"
+ "ld1 { v31.h }[0], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[2], [x25]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[0], [x25]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ldr x19, [x14, #0x48]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
+ "smlal2 v19.4s, v31.8h, v2.8h\n"
+ "add x19, x19, x17\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v30.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v30.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[6], [x19]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[4], [x19]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x8, #1, 30f\n"
+ "ld1 { v30.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[2], [x19]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[0], [x19]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr x24, [x14, #0x50]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "add x24, x24, x17\n"
+ "smlal v17.4s, v30.4h, v7.4h\n"
+ "smlal2 v25.4s, v30.8h, v7.8h\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v21.4s, v30.8h, v5.8h\n"
+ "smlal v23.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ldr x23, [x14, #0x58]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
+ "smlal2 v19.4s, v29.8h, v3.8h\n"
+ "add x23, x23, x17\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x8, #1, 38f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ldr x22, [x14, #0x60]\n"
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v5.4h\n"
+ "smlal2 v25.4s, v28.8h, v5.8h\n"
+ "add x22, x22, x17\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v31.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x8, #1, 42f\n"
+ "ld1 { v31.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ldr x21, [x14, #0x68]\n"
+ "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal2 v19.4s, v31.8h, v6.8h\n"
+ "add x21, x21, x17\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x8, #1, 46f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr x20, [x14, #0x70]\n"
+ "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal v17.4s, v30.4h, v8.4h\n"
+ "smlal2 v25.4s, v30.8h, v8.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v23.4s, v30.4h, v5.4h\n"
+ "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x8, #1, 50f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr x19, [x14, #0x78]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "add x19, x19, x17\n"
+ "smlal v23.4s, v29.4h, v6.4h\n"
+ "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v28.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[6], [x19]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[4], [x19]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x8, #1, 54f\n"
+ "ld1 { v28.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[2], [x19]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[0], [x19]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v21.4s, v28.8h, v8.8h\n"
+ "smlal v23.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v26.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 56f\n"
+ "ld1 { v11.d }[0], [x13], #0x8\n"
+ "ld1 { v18.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v11.s }[0], [x13]\n"
+ "ld1 { v18.s }[0], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x8, #1, 58f\n"
+ "ld1 { v26.d }[0], [x13], #0x8\n"
+ "ld1 { v10.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v26.s }[2], [x13]\n"
+ "ld1 { v10.s }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v26.s }[0], [x13]\n"
+ "ld1 { v10.s }[0], [x11]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "add x10, x10, x15\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "add x9, x9, x15\n"
+ "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+ "add x28, x28, x15\n"
+ "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+ "add x27, x27, x15\n"
+ "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+ "and v22.16b, v13.16b, v10.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v18.16b\n"
+ "and v3.16b, v17.16b, v10.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v6.16b, v25.16b, v18.16b\n"
+ "and v0.16b, v16.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "sqadd v17.4s, v17.4s, v3.4s\n"
+ "srshl v13.4s, v13.4s, v10.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
+ "srshl v17.4s, v17.4s, v10.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v18.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "smax v13.4s, v13.4s, v24.4s\n"
+ "smin v17.4s, v17.4s, v12.4s\n"
+ "smax v19.4s, v19.4s, v24.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v24.4s\n"
+ "uzp1 v13.16b, v13.16b, v19.16b\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqadd v16.4s, v16.4s, v0.4s\n"
+ "smax v25.4s, v25.4s, v24.4s\n"
+ "and v29.16b, v21.16b, v18.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v25.16b\n"
+ "srshl v16.4s, v16.4s, v10.4s\n"
+ "and v3.16b, v23.16b, v10.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "add v16.4s, v16.4s, v15.4s\n"
+ "sqadd v21.4s, v21.4s, v29.4s\n"
+ "and v25.16b, v20.16b, v18.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smin v16.4s, v16.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v10.4s\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "srshl v20.4s, v20.4s, v18.4s\n"
+ "smax v21.4s, v21.4s, v24.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "smax v23.4s, v23.4s, v24.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v24.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v13.s }[0], [x10], #0x4\n"
+ "st1 { v17.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
+ "st1 { v23.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v13.h }[2], [x10], #0x2\n"
+ "st1 { v17.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[6], [x10], #0x1\n"
+ "st1 { v17.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
+ "st1 { v23.b }[6], [x27], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[4], [x10], #0x1\n"
+ "st1 { v17.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
+ "st1 { v23.b }[4], [x27], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x8, #1, 62f\n"
+ "st1 { v13.h }[0], [x10], #0x2\n"
+ "st1 { v17.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
+ "st1 { v23.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[2], [x10], #0x1\n"
+ "st1 { v17.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
+ "st1 { v23.b }[2], [x27], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v13.b }[0], [x10], #0x1\n"
+ "st1 { v17.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
+ "st1 { v23.b }[0], [x27], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+
+ "64:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
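
Reviewer note: before the requantize tail, the main loop above widens each uint8_t operand with usubl (subtracting the relevant zero point as it extends to 16 bits) and accumulates with smlal/smlal2 into 32-bit lanes, starting from the loaded bias. A scalar model of one output point of the 3x3 window under those semantics; the function and argument names are illustrative, not from this patch:

    #include <cstdint>

    // One output accumulator of the 3x3 depthwise window: both operands are
    // zero-point corrected to signed 16-bit (the usubl step), then multiplied
    // and summed in 32 bits (the smlal/smlal2 step), starting from the bias.
    static inline int32_t depthwise_3x3_acc(const uint8_t in[3][3], const uint8_t w[3][3],
                                            int32_t bias, uint8_t a_offset, uint8_t b_offset)
    {
        int32_t acc = bias;
        for (int r = 0; r < 3; r++)
        {
            for (int c = 0; c < 3; c++)
            {
                const int16_t x = (int16_t) in[r][c] - (int16_t) a_offset;  // usubl on inputs
                const int16_t y = (int16_t) w[r][c]  - (int16_t) b_offset;  // usubl on weights
                acc += (int32_t) x * (int32_t) y;                           // smlal / smlal2
            }
        }
        return acc;
    }

The oddments path (labels 3 through 63) then handles a channel remainder of 1 to 7 by testing bits 2, 1 and 0 of the count and loading or storing 4-, 2- and 1-byte pieces accordingly.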
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..44817dbccf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+ a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
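
Reviewer note: the same footprint relation as for the stride-1 variant explains the larger input tile here: (2 - 1) * 2 + 3 = 5, so a 2x2 output at stride 2 needs a 5x5 input patch, matching input_rows/input_cols above. A one-line compile-time check (sketch only):

    static_assert((2u - 1u) * 2u + 3u == 5u,
                  "2x2 output, 3x3 kernel, stride 2 -> 5x5 input, as declared above");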
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ccdde41973
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const uint8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x5, #0x0\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x7, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x8, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x16, x4, #0x3\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v11.4s }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "cbz x16, 3f\n"
+ "subs x16, x16, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "mov v20.16b, v15.16b\n"
+ "ldr q10, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v15.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr d0, [x6, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "mov v23.16b, v10.16b\n"
+ "ldr d1, [x6, #0x8]\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr d2, [x6, #0x10]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d3, [x6, #0x18]\n"
+ "ldr d4, [x6, #0x20]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr d5, [x6, #0x28]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldr d6, [x6, #0x30]\n"
+ "ldr d7, [x6, #0x38]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldr d8, [x6, #0x40]\n"
+ "usubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x26, x25, [x8, #0x0]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldp x24, x23, [x8, #0x10]\n"
+ "usubl v7.8h, v7.8b, v13.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldp x22, x21, [x8, #0x20]\n"
+ "ldp x20, x19, [x8, #0x30]\n"
+ "ldr d31, [x26, x5]\n"
+ "usubl v31.8h, v31.8b, v12.8b\n"
+ "ldr d30, [x25, x5]\n"
+ "ldr d29, [x24, x5]\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
+ "ldr d28, [x23, x5]\n"
+ "ldr d27, [x22, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr d26, [x21, x5]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "ldr d25, [x20, x5]\n"
+ "ldr d24, [x19, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x6, x6, #0x48\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ldr x22, [x8, #0x48]\n"
+ "subs x16, x16, #0x1\n"
+ "smlal v20.4s, v31.4h, v6.4h\n"
+ "ldr x21, [x8, #0x50]\n"
+ "smlal2 v23.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x8, #0x58]\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ldr x19, [x8, #0x60]\n"
+ "smlal2 v22.4s, v31.8h, v2.8h\n"
+ "ldr x10, [x8, #0x68]\n"
+ "smlal v17.4s, v31.4h, v0.4h\n"
+ "ldr x9, [x8, #0x70]\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x8, #0x78]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "ldr x27, [x8, #0x80]\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x26, [x8, #0x88]\n"
+ "smlal v20.4s, v28.4h, v1.4h\n"
+ "ldr x25, [x8, #0x90]\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x5]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "ldr x24, [x8, #0x98]\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x23, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ldr x23, [x8, #0xa0]\n"
+ "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x8, #0xa8]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x5]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "ldr x21, [x8, #0xb0]\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x19, x5]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "ldr x20, [x8, #0xb8]\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "ldr x19, [x8, #0xc0]\n"
+ "smlal v20.4s, v24.4h, v0.4h\n"
+ "ldr q21, [x17, #0x0]\n"
+ "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "ldr d24, [x9, x5]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v4.4h\n"
+ "ldr q30, [x15, #0x0]\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x10, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v5.4h\n"
+ "ldr q31, [x17, #0x10]\n"
+ "smlal2 v23.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x27, x5]\n"
+ "add x17, x17, #0x20\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "ldr q9, [x15, #0x10]\n"
+ "add x15, x15, #0x20\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x28, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v26.4h, v3.4h\n"
+ "smlal2 v22.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x26, x5]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v22.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x25, x5]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v29.4h, v4.4h\n"
+ "smlal2 v22.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v22.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x5]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x5]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x21, x5]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x19, x5]\n"
+ "add x5, x5, #0x8\n"
+ "smlal v16.4s, v27.4h, v7.4h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v22.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v16.4s, v24.4h, v5.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+ "smlal2 v22.4s, v24.8h, v5.8h\n"
+ "smlal v17.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal v16.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "smlal v17.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "and v26.16b, v15.16b, v30.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v26.4s\n"
+ "and v8.16b, v10.16b, v9.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v30.4s\n"
+ "and v4.16b, v20.16b, v30.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v9.16b\n"
+ "and v1.16b, v16.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v11.4s\n"
+ "sqadd v10.4s, v10.4s, v8.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "smin v15.4s, v15.4s, v14.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "add v10.4s, v10.4s, v11.4s\n"
+ "srshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "uzp1 v15.16b, v15.16b, v10.16b\n"
+ "smax v20.4s, v20.4s, v19.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x14, x7]\n"
+ "smax v23.4s, v23.4s, v19.4s\n"
+ "srshl v16.4s, v16.4s, v30.4s\n"
+ "and v24.16b, v22.16b, v9.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v23.16b\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d20, [x13, x7]\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v2.16b, v17.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "smax v16.4s, v16.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v9.4s\n"
+ "and v31.16b, v18.16b, v9.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v31.4s\n"
+ "smax v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v16.16b, v16.16b, v22.16b\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "srshl v18.4s, v18.4s, v9.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x12, x7]\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "smax v17.4s, v17.4s, v19.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smax v18.4s, v18.4s, v19.4s\n"
+ "uzp1 v17.16b, v17.16b, v18.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x11, x7]\n"
+ "add x7, x7, #0x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "mov v20.16b, v15.16b\n"
+ "ldr q10, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v16.16b, v15.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr d0, [x6, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "mov v23.16b, v10.16b\n"
+ "ldr d1, [x6, #0x8]\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr d2, [x6, #0x10]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d3, [x6, #0x18]\n"
+ "ldr d4, [x6, #0x20]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr d5, [x6, #0x28]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldr d6, [x6, #0x30]\n"
+ "ldr d7, [x6, #0x38]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldr d8, [x6, #0x40]\n"
+ "usubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x26, x25, [x8, #0x0]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldp x24, x23, [x8, #0x10]\n"
+ "usubl v7.8h, v7.8b, v13.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldp x22, x21, [x8, #0x20]\n"
+ "ldp x20, x19, [x8, #0x30]\n"
+ "ldr d31, [x26, x5]\n"
+ "usubl v31.8h, v31.8b, v12.8b\n"
+ "ldr d30, [x25, x5]\n"
+ "ldr d29, [x24, x5]\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
+ "ldr d28, [x23, x5]\n"
+ "ldr d27, [x22, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr d26, [x21, x5]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "ldr d25, [x20, x5]\n"
+ "ldr d24, [x19, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "ldr x23, [x8, #0x40]\n"
+ "tst x4, #0x7\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ldr x22, [x8, #0x48]\n"
+ "smlal v20.4s, v31.4h, v6.4h\n"
+ "ldr x21, [x8, #0x50]\n"
+ "smlal2 v23.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x8, #0x58]\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ldr x19, [x8, #0x60]\n"
+ "smlal2 v22.4s, v31.8h, v2.8h\n"
+ "ldr x10, [x8, #0x68]\n"
+ "smlal v17.4s, v31.4h, v0.4h\n"
+ "ldr x9, [x8, #0x70]\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x8, #0x78]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "ldr x27, [x8, #0x80]\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x26, [x8, #0x88]\n"
+ "smlal v20.4s, v28.4h, v1.4h\n"
+ "ldr x25, [x8, #0x90]\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x5]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "ldr x24, [x8, #0x98]\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x23, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "ldr x23, [x8, #0xa0]\n"
+ "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x8, #0xa8]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x20, x5]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "ldr x21, [x8, #0xb0]\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x19, x5]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "ldr x20, [x8, #0xb8]\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "ldr x19, [x8, #0xc0]\n"
+ "smlal v20.4s, v24.4h, v0.4h\n"
+ "ldr q21, [x17, #0x0]\n"
+ "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "ldr d24, [x9, x5]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v4.4h\n"
+ "ldr q30, [x15, #0x0]\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x10, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v5.4h\n"
+ "ldr q31, [x17, #0x10]\n"
+ "smlal2 v23.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x27, x5]\n"
+ "add x17, x17, #0x20\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "ldr q9, [x15, #0x10]\n"
+ "add x15, x15, #0x20\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x28, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v26.4h, v3.4h\n"
+ "smlal2 v22.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x26, x5]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v22.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x25, x5]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v29.4h, v4.4h\n"
+ "smlal2 v22.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x5]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v22.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x5]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x5]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x5]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x21, x5]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x19, x5]\n"
+ "add x5, x5, #0x8\n"
+ "smlal v16.4s, v27.4h, v7.4h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v22.4s, v27.8h, v7.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v16.4s, v24.4h, v5.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+ "smlal2 v22.4s, v24.8h, v5.8h\n"
+ "smlal v17.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "smlal v16.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "smlal v17.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "and v26.16b, v15.16b, v30.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+ "sqadd v15.4s, v15.4s, v26.4s\n"
+ "and v8.16b, v10.16b, v9.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v30.4s\n"
+ "and v4.16b, v20.16b, v30.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v9.16b\n"
+ "and v1.16b, v16.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v11.4s\n"
+ "sqadd v10.4s, v10.4s, v8.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "smin v15.4s, v15.4s, v14.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "add v10.4s, v10.4s, v11.4s\n"
+ "srshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "uzp1 v15.16b, v15.16b, v10.16b\n"
+ "smax v20.4s, v20.4s, v19.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x14, x7]\n"
+ "smax v23.4s, v23.4s, v19.4s\n"
+ "srshl v16.4s, v16.4s, v30.4s\n"
+ "and v24.16b, v22.16b, v9.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v23.16b\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d20, [x13, x7]\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v2.16b, v17.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "smax v16.4s, v16.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v9.4s\n"
+ "and v31.16b, v18.16b, v9.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v31.4s\n"
+ "smax v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v16.16b, v16.16b, v22.16b\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "srshl v18.4s, v18.4s, v9.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x12, x7]\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "smax v17.4s, v17.4s, v19.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smax v18.4s, v18.4s, v19.4s\n"
+ "uzp1 v17.16b, v17.16b, v18.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d17, [x11, x7]\n"
+ "add x7, x7, #0x8\n"
+ "beq 88f\n"
+ "add x6, x6, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x4, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x4, #1, 4f\n"
+ "ld1 { v10.d }[0], [x19], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v10.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x4, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v20.16b, v15.16b\n"
+ "ldr d0, [x6, #0x0]\n"
+ "mov v23.16b, v10.16b\n"
+ "ldr d1, [x6, #0x8]\n"
+ "mov v16.16b, v15.16b\n"
+ "ldr d2, [x6, #0x10]\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr d3, [x6, #0x18]\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr d4, [x6, #0x20]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d5, [x6, #0x28]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr d6, [x6, #0x30]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr d7, [x6, #0x38]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldr d8, [x6, #0x40]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x26, x25, [x8, #0x0]\n"
+ "usubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x24, x23, [x8, #0x10]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v13.8b\n"
+ "ldp x22, x21, [x8, #0x20]\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldp x20, x19, [x8, #0x30]\n"
+ "add x26, x26, x5\n"
+ "add x25, x25, x5\n"
+ "add x24, x24, x5\n"
+ "add x23, x23, x5\n"
+ "add x22, x22, x5\n"
+ "add x21, x21, x5\n"
+ "add x20, x20, x5\n"
+ "add x19, x19, x5\n"
+ "tbz x4, #2, 9f\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v30.s }[0], [x25], #0x4\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 8f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v30.b }[6], [x25]\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v30.b }[4], [x25]\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x4, #1, 10f\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v30.h }[0], [x25], #0x2\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v30.b }[2], [x25]\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v30.b }[0], [x25]\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ldr x23, [x8, #0x40]\n"
+ "usubl v31.8h, v31.8b, v12.8b\n"
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v31.4h, v6.4h\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal2 v23.4s, v31.8h, v6.8h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal2 v22.4s, v31.8h, v2.8h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v17.4s, v31.4h, v0.4h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "add x23, x23, x5\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v20.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "smlal v20.4s, v27.4h, v2.4h\n"
+ "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v24.4h, v0.4h\n"
+ "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "tbz x4, #2, 13f\n"
+ "ld1 { v29.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 12f\n"
+ "ld1 { v29.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[6], [x23]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[4], [x23]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x4, #1, 14f\n"
+ "ld1 { v29.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[2], [x23]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v29.b }[0], [x23]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ldr x22, [x8, #0x48]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v4.4h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "add x22, x22, x5\n"
+ "tbz x4, #2, 17f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 16f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[6], [x22]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[4], [x22]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x4, #1, 18f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[2], [x22]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v28.b }[0], [x22]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ldr x21, [x8, #0x50]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v5.4h\n"
+ "smlal2 v23.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x5\n"
+ "tbz x4, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x4, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ldr x20, [x8, #0x58]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "add x20, x20, x5\n"
+ "smlal v20.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "tbz x4, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x4, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ldr x19, [x8, #0x60]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v16.4s, v26.4h, v3.4h\n"
+ "smlal2 v22.4s, v26.8h, v3.8h\n"
+ "add x19, x19, x5\n"
+ "tbz x4, #2, 29f\n"
+ "ld1 { v25.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 28f\n"
+ "ld1 { v25.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[6], [x19]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[4], [x19]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x4, #1, 30f\n"
+ "ld1 { v25.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[2], [x19]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v25.b }[0], [x19]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ldr x10, [x8, #0x68]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "add x10, x10, x5\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v22.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 33f\n"
+ "ld1 { v29.s }[0], [x10], #0x4\n"
+ "tbz x4, #1, 32f\n"
+ "ld1 { v29.h }[2], [x10], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[6], [x10]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[4], [x10]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x4, #1, 34f\n"
+ "ld1 { v29.h }[0], [x10], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[2], [x10]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v29.b }[0], [x10]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr x9, [x8, #0x70]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v16.4s, v29.4h, v4.4h\n"
+ "smlal2 v22.4s, v29.8h, v4.8h\n"
+ "add x9, x9, x5\n"
+ "tbz x4, #2, 37f\n"
+ "ld1 { v24.s }[0], [x9], #0x4\n"
+ "tbz x4, #1, 36f\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[6], [x9]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[4], [x9]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x4, #1, 38f\n"
+ "ld1 { v24.h }[0], [x9], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[2], [x9]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v24.b }[0], [x9]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr x28, [x8, #0x78]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "add x28, x28, x5\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v22.4s, v24.8h, v1.8h\n"
+ "tbz x4, #2, 41f\n"
+ "ld1 { v27.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 40f\n"
+ "ld1 { v27.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[6], [x28]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[4], [x28]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x4, #1, 42f\n"
+ "ld1 { v27.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[2], [x28]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v27.b }[0], [x28]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr x27, [x8, #0x80]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "add x27, x27, x5\n"
+ "tbz x4, #2, 45f\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
+ "tbz x4, #1, 44f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[4], [x27]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x4, #1, 46f\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v28.b }[0], [x27]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr x26, [x8, #0x88]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "add x26, x26, x5\n"
+ "smlal v17.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "tbz x4, #2, 49f\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 48f\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[6], [x26]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[4], [x26]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x4, #1, 50f\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[2], [x26]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v26.b }[0], [x26]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr x25, [x8, #0x90]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "add x25, x25, x5\n"
+ "tbz x4, #2, 53f\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
+ "tbz x4, #1, 52f\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x4, #1, 54f\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v25.b }[0], [x25]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ldr x24, [x8, #0x98]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v25.4h, v6.4h\n"
+ "smlal2 v22.4s, v25.8h, v6.8h\n"
+ "add x24, x24, x5\n"
+ "tbz x4, #2, 57f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 56f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x4, #1, 58f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr x23, [x8, #0xa0]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "add x23, x23, x5\n"
+ "smlal v17.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "tbz x4, #2, 61f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 60f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x4, #1, 62f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr x22, [x8, #0xa8]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v16.4s, v27.4h, v7.4h\n"
+ "smlal2 v22.4s, v27.8h, v7.8h\n"
+ "add x22, x22, x5\n"
+ "tbz x4, #2, 65f\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 64f\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x4, #1, 66f\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr x21, [x8, #0xb0]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v16.4s, v24.4h, v5.4h\n"
+ "smlal2 v22.4s, v24.8h, v5.8h\n"
+ "add x21, x21, x5\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "tbz x4, #2, 69f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 68f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x4, #1, 70f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr x20, [x8, #0xb8]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v17.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "add x20, x20, x5\n"
+ "tbz x4, #2, 73f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 72f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x4, #1, 74f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr x19, [x8, #0xc0]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v16.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "add x19, x19, x5\n"
+ "smlal v17.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "tbz x4, #2, 77f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 76f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[6], [x19]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[4], [x19]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x4, #1, 78f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[2], [x19]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v29.b }[0], [x19]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "tbz x4, #2, 81f\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "ld1 { v30.4s }, [x15], #0x10\n"
+ "tbz x4, #1, 80f\n"
+ "ld1 { v31.d }[0], [x17], #0x8\n"
+ "ld1 { v9.d }[0], [x15], #0x8\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v31.s }[2], [x17]\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v31.s }[0], [x17]\n"
+ "ld1 { v9.s }[0], [x15]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x4, #1, 82f\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "ld1 { v30.d }[0], [x15], #0x8\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v21.s }[2], [x17]\n"
+ "ld1 { v30.s }[2], [x15]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v21.s }[0], [x17]\n"
+ "ld1 { v30.s }[0], [x15]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+ "add x14, x14, x7\n"
+ "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+ "add x13, x13, x7\n"
+ "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+ "add x12, x12, x7\n"
+ "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+ "add x11, x11, x7\n"
+ "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+ "and v26.16b, v15.16b, v30.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v8.16b, v10.16b, v9.16b\n"
+ "and v4.16b, v20.16b, v30.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v2.16b, v23.16b, v9.16b\n"
+ "and v1.16b, v16.16b, v30.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v26.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "sqadd v10.4s, v10.4s, v8.4s\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "srshl v15.4s, v15.4s, v30.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "add v15.4s, v15.4s, v11.4s\n"
+ "srshl v23.4s, v23.4s, v9.4s\n"
+ "add v10.4s, v10.4s, v11.4s\n"
+ "smin v15.4s, v15.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "smax v20.4s, v20.4s, v19.4s\n"
+ "uzp1 v15.16b, v15.16b, v10.16b\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqadd v16.4s, v16.4s, v1.4s\n"
+ "smax v23.4s, v23.4s, v19.4s\n"
+ "and v24.16b, v22.16b, v9.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v23.16b\n"
+ "srshl v16.4s, v16.4s, v30.4s\n"
+ "and v2.16b, v17.16b, v30.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "sqadd v22.4s, v22.4s, v24.4s\n"
+ "and v31.16b, v18.16b, v9.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "srshl v22.4s, v22.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v19.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqadd v18.4s, v18.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "srshl v18.4s, v18.4s, v9.4s\n"
+ "smax v22.4s, v22.4s, v19.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v22.16b\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "smax v17.4s, v17.4s, v19.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smax v18.4s, v18.4s, v19.4s\n"
+ "uzp1 v17.16b, v17.16b, v18.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "tbz x4, #2, 85f\n"
+ "st1 { v15.s }[0], [x14], #0x4\n"
+ "st1 { v20.s }[0], [x13], #0x4\n"
+ "st1 { v16.s }[0], [x12], #0x4\n"
+ "st1 { v17.s }[0], [x11], #0x4\n"
+ "tbz x4, #1, 84f\n"
+ "st1 { v15.h }[2], [x14], #0x2\n"
+ "st1 { v20.h }[2], [x13], #0x2\n"
+ "st1 { v16.h }[2], [x12], #0x2\n"
+ "st1 { v17.h }[2], [x11], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[6], [x14], #0x1\n"
+ "st1 { v20.b }[6], [x13], #0x1\n"
+ "st1 { v16.b }[6], [x12], #0x1\n"
+ "st1 { v17.b }[6], [x11], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[4], [x14], #0x1\n"
+ "st1 { v20.b }[4], [x13], #0x1\n"
+ "st1 { v16.b }[4], [x12], #0x1\n"
+ "st1 { v17.b }[4], [x11], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 86f\n"
+ "st1 { v15.h }[0], [x14], #0x2\n"
+ "st1 { v20.h }[0], [x13], #0x2\n"
+ "st1 { v16.h }[0], [x12], #0x2\n"
+ "st1 { v17.h }[0], [x11], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[2], [x14], #0x1\n"
+ "st1 { v20.b }[2], [x13], #0x1\n"
+ "st1 { v16.b }[2], [x12], #0x1\n"
+ "st1 { v17.b }[2], [x11], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "st1 { v15.b }[0], [x14], #0x1\n"
+ "st1 { v20.b }[0], [x13], #0x1\n"
+ "st1 { v16.b }[0], [x12], #0x1\n"
+ "st1 { v17.b }[0], [x11], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+
+ "88:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
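+
+// Illustrative sketch only -- not part of the generated kernel. A scalar C++
+// rendering of the per-lane requantization performed above by the SQRDMULH /
+// AND+SSHR+SQADD / SRSHL / ADD / SMIN+SMAX / UZP1 sequence; the helper name
+// and signature are hypothetical (std::min/std::max from <algorithm>):
+//
+//   static inline uint8_t requantize_lane(int32_t acc, int32_t mul,
+//                                         int32_t shift,  // <= 0: right shift
+//                                         int32_t c_offset,
+//                                         int32_t minval, int32_t maxval)
+//   {
+//     // SQRDMULH: doubling, rounding, saturating multiply returning the
+//     // high 32 bits (saturation only matters for acc == mul == INT32_MIN).
+//     int32_t v = (int32_t) ((2 * (int64_t) acc * mul + (1LL << 30)) >> 31);
+//     if (shift < 0)
+//     {
+//       // The AND+SSHR+SQADD fixup nudges negative values down by one so
+//       // that the rounding right shift (SRSHL by a negative amount) rounds
+//       // ties away from zero instead of towards +infinity.
+//       if (v < 0) v--;
+//       v = (v + (1 << (-shift - 1))) >> -shift;
+//     }
+//     v += c_offset;                             // ADD with v11 (c_offset)
+//     v = std::min(std::max(v, minval), maxval); // SMAX/SMIN clamp
+//     return (uint8_t) v;                        // UZP1 narrowing before store
+//   }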
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..73de9650c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
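+  // i.e. a 6x6 input patch: 2x2 output + (5 - 1) border rows/cols at stride 1.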
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_5x5_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_5x5_mla::get_packed_size;
+
+ kern_type kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+ a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..699cc6c80c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const uint8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+ }
+ };
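+
+  // Eleven of the 36 input pointers (indices 2-12) are permuted above --
+  // e.g. inptrs[2] = inptrs_raw[6] -- evidently so the pointers land in the
+  // order the asm first consumes them; the rest are copied straight through.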
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
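+  // Same structure as the kernel above: full 8-channel blocks in "Loop" and
+  // "Tail", the remainder in "Oddments". Four accumulator pairs (v15/v20,
+  // v18/v5, v11/v8, v10/v9) produce the 2x2 output tile from the 6x6 patch.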
+ __asm__ __volatile__(
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x10, #0x0\n"
+ "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x1, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x25, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x19, x4, #0x3\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v7.16b }, [x13]\n"
+ "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v19.4s }, [x8]\n"
+ "add x8, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "ld1r { v12.4s }, [x8]\n"
+ "ldp x17, x16, [x21, #0x0]\n"
+ "ldp x6, x8, [x21, #0x10]\n"
+ "cbz x19, 3f\n"
+ "subs x19, x19, #0x1\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x12, #0x0]\n"
+ "mov v18.16b, v15.16b\n"
+ "ldr q20, [x12, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ "mov v11.16b, v15.16b\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "ldr d4, [x3, #0x20]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "ldr d31, [x28, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "ldr d30, [x27, x10]\n"
+ "ldr d29, [x26, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "ldr d28, [x13, x10]\n"
+ "ldr d27, [x24, x10]\n"
+ "usubl v29.8h, v29.8b, v7.8b\n"
+ "ldr d23, [x23, x10]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "ldr d25, [x22, x10]\n"
+ "ldr d24, [x21, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "ldr d26, [x20, x10]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "ldr d22, [x0, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "usubl v22.8h, v22.8b, v7.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x25, #0x50]\n"
+ "subs x19, x19, #0x1\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "smlal v10.4s, v28.4h, v0.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v1.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x0, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "smlal2 v9.4s, v31.8h, v2.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr q6, [x2, #0x0]\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x7, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr q21, [x5, #0x0]\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "ldr q17, [x2, #0x10]\n"
+ "add x2, x2, #0x20\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "ldr q14, [x5, #0x10]\n"
+ "add x5, x5, #0x20\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v9.4s, v30.8h, v3.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x26, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v20.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v9.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x10]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v1.4h\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v9.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v20.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x20, x10]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v31.4h, v2.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v9.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v20.4s, v31.8h, v3.8h\n"
+ "ldr d31, [x13, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal v10.4s, v23.4h, v3.4h\n"
+ "smlal2 v9.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x10]\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v20.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x0, x10]\n"
+ "usubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "smlal v10.4s, v30.4h, v0.4h\n"
+ "smlal2 v9.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v20.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x11, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal v10.4s, v26.4h, v1.4h\n"
+ "smlal2 v9.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v20.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x24, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x15, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v20.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x10]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x12, x10]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v22.4h, v4.4h\n"
+ "smlal2 v9.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x27, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal v10.4s, v23.4h, v0.4h\n"
+ "smlal2 v9.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v20.4s, v26.8h, v2.8h\n"
+ "ldr d26, [x7, x10]\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v30.4h, v2.4h\n"
+ "smlal2 v9.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x26, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v28.4h, v3.4h\n"
+ "smlal2 v9.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "add x3, x3, #0xc8\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v20.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x10]\n"
+ "smlal v18.4s, v23.4h, v0.4h\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x20, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v20.4s, v23.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x13, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v1.4h\n"
+ "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x10]\n"
+ "add x10, x10, #0x8\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v20.4s, v30.8h, v3.8h\n"
+ "smlal v18.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v20.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "and v1.16b, v15.16b, v21.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v29.16b, v20.16b, v14.16b\n"
+ "and v3.16b, v18.16b, v21.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v2.16b, v5.16b, v14.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v1.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "and v0.16b, v11.16b, v21.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "and v27.16b, v8.16b, v14.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "smin v15.4s, v15.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v19.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "add v5.4s, v5.4s, v19.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "sqadd v11.4s, v11.4s, v0.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x17, x1]\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "and v30.16b, v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v5.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d18, [x16, x1]\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x6, x1]\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d10, [x8, x1]\n"
+ "add x1, x1, #0x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x12, #0x0]\n"
+ "mov v18.16b, v15.16b\n"
+ "ldr q20, [x12, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ "mov v11.16b, v15.16b\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "ldr d4, [x3, #0x20]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "ldr d31, [x28, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "ldr d30, [x27, x10]\n"
+ "ldr d29, [x26, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "ldr d28, [x13, x10]\n"
+ "ldr d27, [x24, x10]\n"
+ "usubl v29.8h, v29.8b, v7.8b\n"
+ "ldr d23, [x23, x10]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "ldr d25, [x22, x10]\n"
+ "ldr d24, [x21, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "ldr d26, [x20, x10]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "ldr d22, [x0, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "usubl v22.8h, v22.8b, v7.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x25, #0x50]\n"
+ "tst x4, #0x7\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "smlal v10.4s, v28.4h, v0.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v1.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x0, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "smlal2 v9.4s, v31.8h, v2.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x7, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr q6, [x2, #0x0]\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "ldr q21, [x5, #0x0]\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "ldr q17, [x2, #0x10]\n"
+ "add x2, x2, #0x20\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr q14, [x5, #0x10]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v9.4s, v30.8h, v3.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x26, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v20.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v9.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x10]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v1.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v9.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v20.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x20, x10]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v31.4h, v2.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v9.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v20.4s, v31.8h, v3.8h\n"
+ "ldr d31, [x13, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v3.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal v10.4s, v23.4h, v3.4h\n"
+ "smlal2 v9.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x10]\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v20.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x0, x10]\n"
+ "usubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "smlal v10.4s, v30.4h, v0.4h\n"
+ "smlal2 v9.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v20.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x11, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal v10.4s, v26.4h, v1.4h\n"
+ "smlal2 v9.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v20.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x24, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x15, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v20.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x10]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v18.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x12, x10]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v22.4h, v4.4h\n"
+ "smlal2 v9.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x27, x10]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal v10.4s, v23.4h, v0.4h\n"
+ "smlal2 v9.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v18.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v20.4s, v26.8h, v2.8h\n"
+ "ldr d26, [x7, x10]\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v30.4h, v2.4h\n"
+ "smlal2 v9.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x26, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v28.4h, v3.4h\n"
+ "smlal2 v9.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v18.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v20.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x10]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v18.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x20, x10]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v20.4s, v23.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x13, x10]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v1.4h\n"
+ "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x10]\n"
+ "add x10, x10, #0x8\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v20.4s, v30.8h, v3.8h\n"
+ "smlal v18.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v20.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v10.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "and v1.16b, v15.16b, v21.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v29.16b, v20.16b, v14.16b\n"
+ "and v3.16b, v18.16b, v21.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v2.16b, v5.16b, v14.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v1.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "and v0.16b, v11.16b, v21.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "and v27.16b, v8.16b, v14.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "srshl v20.4s, v20.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "smin v15.4s, v15.4s, v12.4s\n"
+ "add v20.4s, v20.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v19.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "add v5.4s, v5.4s, v19.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "sqadd v11.4s, v11.4s, v0.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x17, x1]\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "and v30.16b, v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v5.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d18, [x16, x1]\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x6, x1]\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d10, [x8, x1]\n"
+ "add x1, x1, #0x8\n"
+ "beq 124f\n"
+ "add x3, x3, #0xc8\n"
+ "3:" // Oddments
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x4, #2, 5f\n"
+ "ld1 { v15.4s }, [x12], #0x10\n"
+ "tbz x4, #1, 4f\n"
+ "ld1 { v20.d }[0], [x12], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v20.s }[2], [x12]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v20.s }[0], [x12]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x4, #1, 6f\n"
+ "ld1 { v15.d }[0], [x12], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[2], [x12]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[0], [x12]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v18.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v11.16b, v15.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "mov v9.16b, v20.16b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "add x28, x28, x10\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "add x13, x13, x10\n"
+ "add x24, x24, x10\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x0, x0, x10\n"
+ "tbz x4, #2, 9f\n"
+ "ld1 { v31.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v29.s }[0], [x26], #0x4\n"
+ "ld1 { v28.s }[0], [x13], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v22.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 8f\n"
+ "ld1 { v31.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x13], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v22.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x13]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v22.b }[6], [x0]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x13]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v22.b }[4], [x0]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x4, #1, 10f\n"
+ "ld1 { v31.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v29.h }[0], [x26], #0x2\n"
+ "ld1 { v28.h }[0], [x13], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v22.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x13]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v22.b }[2], [x0]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v29.b }[0], [x26]\n"
+ "ld1 { v28.b }[0], [x13]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v22.b }[0], [x0]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ldr x20, [x25, #0x50]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "usubl v29.8h, v29.8b, v7.8b\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v28.4h, v0.4h\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "usubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "add x20, x20, x10\n"
+ "smlal v18.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "tbz x4, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x4, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "add x28, x28, x10\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "smlal2 v9.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "tbz x4, #2, 17f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 16f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x4, #1, 18f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "add x0, x0, x10\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v9.4s, v30.8h, v3.8h\n"
+ "tbz x4, #2, 21f\n"
+ "ld1 { v27.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 20f\n"
+ "ld1 { v27.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[6], [x0]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[4], [x0]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x4, #1, 22f\n"
+ "ld1 { v27.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[2], [x0]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[0], [x0]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "add x7, x7, x10\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v20.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "tbz x4, #2, 25f\n"
+ "ld1 { v25.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 24f\n"
+ "ld1 { v25.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[6], [x7]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[4], [x7]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x4, #1, 26f\n"
+ "ld1 { v25.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[2], [x7]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[0], [x7]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d1, [x3, #0x30]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v25.8h, v0.8h\n"
+ "add x26, x26, x10\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v18.4s, v23.4h, v1.4h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "tbz x4, #2, 29f\n"
+ "ld1 { v24.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 28f\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[6], [x26]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[4], [x26]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x4, #1, 30f\n"
+ "ld1 { v24.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[2], [x26]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[0], [x26]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d2, [x3, #0x38]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v1.8h\n"
+ "add x23, x23, x10\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v20.4s, v23.8h, v2.8h\n"
+ "smlal v18.4s, v31.4h, v2.4h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "tbz x4, #2, 33f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 32f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x4, #1, 34f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d3, [x3, #0x40]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v27.8h, v2.8h\n"
+ "add x20, x20, x10\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v20.4s, v31.8h, v3.8h\n"
+ "smlal v18.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "tbz x4, #2, 37f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 36f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x4, #1, 38f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d4, [x3, #0x48]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v10.4s, v23.4h, v3.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v23.8h, v3.8h\n"
+ "add x22, x22, x10\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "tbz x4, #2, 41f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 40f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x4, #1, 42f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d0, [x3, #0x50]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "add x13, x13, x10\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v20.4s, v22.8h, v0.8h\n"
+ "smlal v18.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 45f\n"
+ "ld1 { v31.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 44f\n"
+ "ld1 { v31.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[6], [x13]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[4], [x13]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x4, #1, 46f\n"
+ "ld1 { v31.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[2], [x13]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[0], [x13]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ldr x21, [x25, #0x98]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "add x21, x21, x10\n"
+ "tbz x4, #2, 49f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 48f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x4, #1, 50f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d1, [x3, #0x58]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v10.4s, v30.4h, v0.4h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v30.8h, v0.8h\n"
+ "add x14, x14, x10\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v20.4s, v25.8h, v1.8h\n"
+ "smlal v18.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "tbz x4, #2, 53f\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
+ "tbz x4, #1, 52f\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[6], [x14]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[4], [x14]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x4, #1, 54f\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[2], [x14]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[0], [x14]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d2, [x3, #0x60]\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v10.4s, v26.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v26.8h, v1.8h\n"
+ "add x11, x11, x10\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v20.4s, v24.8h, v2.8h\n"
+ "smlal v18.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "tbz x4, #2, 57f\n"
+ "ld1 { v25.s }[0], [x11], #0x4\n"
+ "tbz x4, #1, 56f\n"
+ "ld1 { v25.h }[2], [x11], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[6], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[4], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x4, #1, 58f\n"
+ "ld1 { v25.h }[0], [x11], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[0], [x11]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d3, [x3, #0x68]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "add x24, x24, x10\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal v18.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 61f\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 60f\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[6], [x24]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[4], [x24]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x4, #1, 62f\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[2], [x24]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[0], [x24]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d4, [x3, #0x70]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "add x0, x0, x10\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v20.4s, v23.8h, v4.8h\n"
+ "smlal v18.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 65f\n"
+ "ld1 { v22.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 64f\n"
+ "ld1 { v22.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[6], [x0]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[4], [x0]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x4, #1, 66f\n"
+ "ld1 { v22.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[2], [x0]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[0], [x0]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d0, [x3, #0x78]\n"
+ "usubl v22.8h, v22.8b, v7.8b\n"
+ "smlal v10.4s, v22.4h, v4.4h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v22.8h, v4.8h\n"
+ "add x15, x15, x10\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "smlal v18.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "tbz x4, #2, 69f\n"
+ "ld1 { v27.s }[0], [x15], #0x4\n"
+ "tbz x4, #1, 68f\n"
+ "ld1 { v27.h }[2], [x15], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[6], [x15]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[4], [x15]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x4, #1, 70f\n"
+ "ld1 { v27.h }[0], [x15], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[2], [x15]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[0], [x15]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ldr x9, [x25, #0xc8]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "add x9, x9, x10\n"
+ "tbz x4, #2, 73f\n"
+ "ld1 { v23.s }[0], [x9], #0x4\n"
+ "tbz x4, #1, 72f\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[6], [x9]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[4], [x9]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x4, #1, 74f\n"
+ "ld1 { v23.h }[0], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[2], [x9]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[0], [x9]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d1, [x3, #0x80]\n"
+ "usubl v23.8h, v23.8b, v7.8b\n"
+ "smlal v10.4s, v23.4h, v0.4h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v23.8h, v0.8h\n"
+ "add x27, x27, x10\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v20.4s, v30.8h, v1.8h\n"
+ "smlal v18.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "tbz x4, #2, 77f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x4, #1, 76f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x4, #1, 78f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d2, [x3, #0x88]\n"
+ "usubl v31.8h, v31.8b, v7.8b\n"
+ "smlal v10.4s, v31.4h, v1.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "add x28, x28, x10\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v20.4s, v26.8h, v2.8h\n"
+ "smlal v18.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "tbz x4, #2, 81f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 80f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x4, #1, 82f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x3, #0x90]\n"
+ "usubl v30.8h, v30.8b, v7.8b\n"
+ "smlal v10.4s, v30.4h, v2.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v30.8h, v2.8h\n"
+ "add x12, x12, x10\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v20.4s, v25.8h, v3.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "tbz x4, #2, 85f\n"
+ "ld1 { v28.s }[0], [x12], #0x4\n"
+ "tbz x4, #1, 84f\n"
+ "ld1 { v28.h }[2], [x12], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[6], [x12]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[4], [x12]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x4, #1, 86f\n"
+ "ld1 { v28.h }[0], [x12], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[2], [x12]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[0], [x12]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d4, [x3, #0x98]\n"
+ "usubl v28.8h, v28.8b, v7.8b\n"
+ "smlal v10.4s, v28.4h, v3.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v28.8h, v3.8h\n"
+ "add x7, x7, x10\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v20.4s, v24.8h, v4.8h\n"
+ "smlal v18.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "tbz x4, #2, 89f\n"
+ "ld1 { v26.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 88f\n"
+ "ld1 { v26.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[6], [x7]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[4], [x7]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x4, #1, 90f\n"
+ "ld1 { v26.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[2], [x7]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[0], [x7]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d0, [x3, #0xa0]\n"
+ "usubl v26.8h, v26.8b, v7.8b\n"
+ "smlal v10.4s, v26.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v9.4s, v26.8h, v4.8h\n"
+ "add x26, x26, x10\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v20.4s, v27.8h, v0.8h\n"
+ "smlal v18.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "tbz x4, #2, 93f\n"
+ "ld1 { v25.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 92f\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x4, #1, 94f\n"
+ "ld1 { v25.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[0], [x26]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ldr x23, [x25, #0xf8]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "add x23, x23, x10\n"
+ "tbz x4, #2, 97f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 96f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x4, #1, 98f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[0], [x23]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d1, [x3, #0xa8]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v0.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "add x22, x22, x10\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v20.4s, v23.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "tbz x4, #2, 101f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 100f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x4, #1, 102f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d2, [x3, #0xb0]\n"
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v1.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "add x20, x20, x10\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v20.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "tbz x4, #2, 105f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 104f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x4, #1, 106f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d3, [x3, #0xb8]\n"
+ "usubl v25.8h, v25.8b, v7.8b\n"
+ "smlal v10.4s, v25.4h, v2.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "add x13, x13, x10\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v20.4s, v30.8h, v3.8h\n"
+ "smlal v18.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 109f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 108f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[6], [x13]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[4], [x13]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x4, #1, 110f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[2], [x13]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[0], [x13]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d4, [x3, #0xc0]\n"
+ "usubl v24.8h, v24.8b, v7.8b\n"
+ "smlal v10.4s, v24.4h, v3.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x10\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v20.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 113f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 112f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x4, #1, 114f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v27.8h, v27.8b, v7.8b\n"
+ "smlal v10.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "tbz x4, #2, 117f\n"
+ "ld1 { v6.4s }, [x2], #0x10\n"
+ "ld1 { v21.4s }, [x5], #0x10\n"
+ "tbz x4, #1, 116f\n"
+ "ld1 { v17.d }[0], [x2], #0x8\n"
+ "ld1 { v14.d }[0], [x5], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v17.s }[2], [x2]\n"
+ "ld1 { v14.s }[2], [x5]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v17.s }[0], [x2]\n"
+ "ld1 { v14.s }[0], [x5]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x4, #1, 118f\n"
+ "ld1 { v6.d }[0], [x2], #0x8\n"
+ "ld1 { v21.d }[0], [x5], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v6.s }[2], [x2]\n"
+ "ld1 { v21.s }[2], [x5]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v6.s }[0], [x2]\n"
+ "ld1 { v21.s }[0], [x5]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "add x17, x17, x1\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "add x16, x16, x1\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "add x6, x6, x1\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "add x8, x8, x1\n"
+ "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+ "and v1.16b, v15.16b, v21.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v29.16b, v20.16b, v14.16b\n"
+ "and v3.16b, v18.16b, v21.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v2.16b, v5.16b, v14.16b\n"
+ "and v0.16b, v11.16b, v21.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v1.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "sqadd v18.4s, v18.4s, v3.4s\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "srshl v20.4s, v20.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
+ "add v15.4s, v15.4s, v19.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v19.4s\n"
+ "smin v15.4s, v15.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v19.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "add v5.4s, v5.4s, v19.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "smin v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "sqadd v11.4s, v11.4s, v0.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "and v27.16b, v8.16b, v14.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v5.16b\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "and v30.16b, v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x4, #2, 121f\n"
+ "st1 { v15.s }[0], [x17], #0x4\n"
+ "st1 { v18.s }[0], [x16], #0x4\n"
+ "st1 { v11.s }[0], [x6], #0x4\n"
+ "st1 { v10.s }[0], [x8], #0x4\n"
+ "tbz x4, #1, 120f\n"
+ "st1 { v15.h }[2], [x17], #0x2\n"
+ "st1 { v18.h }[2], [x16], #0x2\n"
+ "st1 { v11.h }[2], [x6], #0x2\n"
+ "st1 { v10.h }[2], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[6], [x17], #0x1\n"
+ "st1 { v18.b }[6], [x16], #0x1\n"
+ "st1 { v11.b }[6], [x6], #0x1\n"
+ "st1 { v10.b }[6], [x8], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[4], [x17], #0x1\n"
+ "st1 { v18.b }[4], [x16], #0x1\n"
+ "st1 { v11.b }[4], [x6], #0x1\n"
+ "st1 { v10.b }[4], [x8], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 122f\n"
+ "st1 { v15.h }[0], [x17], #0x2\n"
+ "st1 { v18.h }[0], [x16], #0x2\n"
+ "st1 { v11.h }[0], [x6], #0x2\n"
+ "st1 { v10.h }[0], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[2], [x17], #0x1\n"
+ "st1 { v18.b }[2], [x16], #0x1\n"
+ "st1 { v11.b }[2], [x6], #0x1\n"
+ "st1 { v10.b }[2], [x8], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[0], [x17], #0x1\n"
+ "st1 { v18.b }[0], [x16], #0x1\n"
+ "st1 { v11.b }[0], [x6], #0x1\n"
+ "st1 { v10.b }[0], [x8], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+
+ "124:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
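
The epilogue above is the standard fixed-point requantisation: SQRDMULH applies the Q31 multiplier, the AND/SSHR/SQADD triple fixes up rounding for negative values, SRSHL performs the rounding right shift, and the result is offset by c_offset (v19), clamped between minval and maxval (v16/v12), and narrowed to bytes with UZP1. A minimal scalar sketch of one lane, assuming gemmlowp-style round-to-nearest-ties-away-from-zero semantics (names are illustrative, not part of the patch; the INT32_MIN saturation corner of SQRDMULH is omitted):

    #include <algorithm>
    #include <cstdint>

    static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int right_shift,
                                          int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: rounding, doubling, high-half multiply.
        const int64_t ab    = (int64_t)acc * (int64_t)mul;
        const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
        const int32_t high  = (int32_t)((ab + nudge) >> 31);
        // and/sshr/sqadd + srshl: divide by 2^right_shift, ties away from zero.
        const int32_t mask   = (1 << right_shift) - 1;
        const int32_t rem    = high & mask;
        const int32_t thresh = (mask >> 1) + (high < 0 ? 1 : 0);
        int32_t out = (high >> right_shift) + (rem > thresh ? 1 : 0);
        out += c_offset;                                // add vN.4s, vN.4s, v19.4s
        out = std::min(std::max(out, minval), maxval);  // smin/smax clamp
        return (uint8_t)out;                            // uzp1 byte narrowing
    }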
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f5459c2ac1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+struct a64_u8q_nhwc_generic_output9_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
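
Like the other strategy headers in this patch, the struct is a compile-time descriptor rather than runnable logic: the typedefs pin down the storage types, n_output_points says how many output pixels one call produces, and kernel binds the assembly implementation. A hypothetical driver, sketched here only to show how the pieces fit together (this is not the library's actual framework code):

    template <class Strategy>
    void run_generic_depthwise(const typename Strategy::input_type *const *inptrs,
                               typename Strategy::return_type *const *outptrs,
                               const void *params, const arm_gemm::Requantize32 &qp,
                               unsigned int n_points, unsigned int n_channels,
                               const CPUInfo *ci)
    {
        Strategy strat(ci);  // the CPUInfo argument exists to match the strategy interface
        // One invocation computes Strategy::n_output_points output pixels
        // across all n_channels channels.
        strat.kernel(inptrs, outptrs, params, qp, n_points, n_channels);
    }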
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..42d9b2f408
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v10.16b }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v9.16b }, [x20]\n"
+ "ld1r { v8.4s }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "mov x11, #0x0\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "lsr x10, %x[n_channels], #0x2\n"
+ "cbz x10, 6f\n"
+ "1:" // Channel loop
+ "movi v27.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x11, #0x2\n"
+ "ldr q27, [%x[bias], x19]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov v26.16b, v27.16b\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v25.16b, v27.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "mov v24.16b, v27.16b\n"
+ "ldr s4, [x9, x11]\n"
+ "mov v23.16b, v27.16b\n"
+ "mov v22.16b, v27.16b\n"
+ "ldr s3, [x28, x11]\n"
+ "mov v21.16b, v27.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v20.16b, v27.16b\n"
+ "ldr s2, [x27, x11]\n"
+ "mov v19.16b, v27.16b\n"
+ "usubl v16.8h, v16.8b, v9.8b\n"
+ "ldr s1, [x26, x11]\n"
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "ldr s0, [x25, x11]\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "ldr s31, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "ldr s30, [x23, x11]\n"
+ "ldr s29, [x22, x11]\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "ldr x21, [x20], #0x8\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "ldr s28, [x21, x11]\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "ldr s4, [x9, x11]\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "ldr s3, [x28, x11]\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "ldr s2, [x27, x11]\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "ldr s1, [x26, x11]\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "ldr s0, [x25, x11]\n"
+ "usubl v16.8h, v16.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "ldr s31, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "ldr s30, [x23, x11]\n"
+ "ldr s29, [x22, x11]\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "ldr x21, [x20], #0x8\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "ldr s28, [x21, x11]\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x19, x11, #0x2\n"
+ "ldr q6, [%x[rq_mul_ptr], x19]\n"
+ "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v27.4s, v27.4s, v7.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "sshl v26.4s, v26.4s, v7.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v7.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+ "sshl v24.4s, v24.4s, v7.4s\n"
+ "and v16.16b, v27.16b, v5.16b\n"
+ "and v18.16b, v26.16b, v5.16b\n"
+ "and v17.16b, v25.16b, v5.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "and v16.16b, v24.16b, v5.16b\n"
+ "add v27.4s, v27.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v8.4s\n"
+ "add v25.4s, v25.4s, v8.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v27.4s, v27.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v12.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x27, x11]\n"
+ "add v24.4s, v24.4s, v8.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x26, x11]\n"
+ "smax v24.4s, v24.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "sshl v23.4s, v23.4s, v7.4s\n"
+ "sshl v22.4s, v22.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sshl v21.4s, v21.4s, v7.4s\n"
+ "and v17.16b, v23.16b, v5.16b\n"
+ "and v16.16b, v22.16b, v5.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x24, x11]\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "sshl v20.4s, v20.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v7.4s\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "and v17.16b, v20.16b, v5.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v5.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v8.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "smax v21.4s, v21.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x23, x11]\n"
+ "add v19.4s, v19.4s, v8.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x22, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x21, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x20, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x19, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x10, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v27.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x19, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v27.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v27.s }[2], [x19], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+
+ "9:" // Oddments: Load bias: Done
+ "mov v26.16b, v27.16b\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v25.16b, v27.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "mov v24.16b, v27.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v23.16b, v27.16b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "mov v22.16b, v27.16b\n"
+ "add x28, x28, x11\n"
+ "mov v21.16b, v27.16b\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "mov v20.16b, v27.16b\n"
+ "add x27, x27, x11\n"
+ "mov v19.16b, v27.16b\n"
+ "ldr x21, [x20], #0x8\n"
+ "usubl v16.8h, v16.8b, v9.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h31, [x24], #0x2\n"
+ "ldr h30, [x23], #0x2\n"
+ "ldr h29, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v4.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v2.b }[2], [x27], #0x1\n"
+ "ld1 { v1.b }[2], [x26], #0x1\n"
+ "ld1 { v0.b }[2], [x25], #0x1\n"
+ "ld1 { v31.b }[2], [x24], #0x1\n"
+ "ld1 { v30.b }[2], [x23], #0x1\n"
+ "ld1 { v29.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ldr b4, [x9], #0x1\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b2, [x27], #0x1\n"
+ "ldr b1, [x26], #0x1\n"
+ "ldr b0, [x25], #0x1\n"
+ "ldr b31, [x24], #0x1\n"
+ "ldr b30, [x23], #0x1\n"
+ "ldr b29, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "add x28, x28, x11\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "add x27, x27, x11\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "ldr x21, [x20], #0x8\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "add x26, x26, x11\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "add x25, x25, x11\n"
+ "usubl v16.8h, v16.8b, v9.8b\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h31, [x24], #0x2\n"
+ "ldr h30, [x23], #0x2\n"
+ "ldr h29, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v4.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v2.b }[2], [x27], #0x1\n"
+ "ld1 { v1.b }[2], [x26], #0x1\n"
+ "ld1 { v0.b }[2], [x25], #0x1\n"
+ "ld1 { v31.b }[2], [x24], #0x1\n"
+ "ld1 { v30.b }[2], [x23], #0x1\n"
+ "ld1 { v29.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ldr b4, [x9], #0x1\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b2, [x27], #0x1\n"
+ "ldr b1, [x26], #0x1\n"
+ "ldr b0, [x25], #0x1\n"
+ "ldr b31, [x24], #0x1\n"
+ "ldr b30, [x23], #0x1\n"
+ "ldr b29, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "subs x19, x19, #0x1\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v6.d }[0], [x21], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v7.d }[0], [x19], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v6.s }[2], [x21], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v7.s }[2], [x19], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v6.s }[0], [x21], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v7.s }[0], [x19], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v27.4s, v27.4s, v7.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "add x27, x27, x11\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "sshl v26.4s, v26.4s, v7.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "add x26, x26, x11\n"
+ "sshl v25.4s, v25.4s, v7.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "sshl v24.4s, v24.4s, v7.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x25, x25, x11\n"
+ "and v16.16b, v27.16b, v5.16b\n"
+ "add x24, x24, x11\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "add x23, x23, x11\n"
+ "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+ "add x22, x22, x11\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "add x21, x21, x11\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x20, x20, x11\n"
+ "and v18.16b, v26.16b, v5.16b\n"
+ "add x19, x19, x11\n"
+ "and v17.16b, v25.16b, v5.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v24.16b, v5.16b\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v27.4s, v27.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v8.4s\n"
+ "add v25.4s, v25.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v12.4s\n"
+ "smax v25.4s, v25.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smax v24.4s, v24.4s, v12.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sshl v23.4s, v23.4s, v7.4s\n"
+ "sshl v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v20.4s, v20.4s, v7.4s\n"
+ "and v17.16b, v23.16b, v5.16b\n"
+ "and v16.16b, v22.16b, v5.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "and v17.16b, v20.16b, v5.16b\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add v23.4s, v23.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "smax v23.4s, v23.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v8.4s\n"
+ "add v20.4s, v20.4s, v8.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v21.4s, v21.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+ "and v16.16b, v19.16b, v5.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v27.h }[0], [x27], #0x2\n"
+ "st1 { v26.h }[0], [x26], #0x2\n"
+ "st1 { v25.h }[0], [x25], #0x2\n"
+ "st1 { v24.h }[0], [x24], #0x2\n"
+ "st1 { v23.h }[0], [x23], #0x2\n"
+ "st1 { v22.h }[0], [x22], #0x2\n"
+ "st1 { v21.h }[0], [x21], #0x2\n"
+ "st1 { v20.h }[0], [x20], #0x2\n"
+ "st1 { v19.h }[0], [x19], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v27.b }[2], [x27], #0x1\n"
+ "st1 { v26.b }[2], [x26], #0x1\n"
+ "st1 { v25.b }[2], [x25], #0x1\n"
+ "st1 { v24.b }[2], [x24], #0x1\n"
+ "st1 { v23.b }[2], [x23], #0x1\n"
+ "st1 { v22.b }[2], [x22], #0x1\n"
+ "st1 { v21.b }[2], [x21], #0x1\n"
+ "st1 { v20.b }[2], [x20], #0x1\n"
+ "st1 { v19.b }[2], [x19], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v27.b }[0], [x27], #0x1\n"
+ "st1 { v26.b }[0], [x26], #0x1\n"
+ "st1 { v25.b }[0], [x25], #0x1\n"
+ "st1 { v24.b }[0], [x24], #0x1\n"
+ "st1 { v23.b }[0], [x23], #0x1\n"
+ "st1 { v22.b }[0], [x22], #0x1\n"
+ "st1 { v21.b }[0], [x21], #0x1\n"
+ "st1 { v20.b }[0], [x20], #0x1\n"
+ "st1 { v19.b }[0], [x19], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+
+ "24:" // End
+
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
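
In the implementation above, each iteration of the channel loop handles four channels at once: the nine accumulators start from the optional bias, the planar loop walks the n_points kernel positions, removing the zero points with USUBL before the SMLAL widening multiply-accumulate, and the per-layer or per-channel requantisation parameters are applied at the end. A scalar reference of the accumulation stage, assuming inptrs is a table of n_points x 9 input pointers; weight_for() is a placeholder for the packed-parameter lookup (my reading of the pointer arithmetic, not authoritative):

    for (unsigned int c = 0; c < n_channels; c++) {
        int32_t acc[9];
        for (int o = 0; o < 9; o++) acc[o] = bias ? bias[c] : 0;
        for (unsigned int p = 0; p < n_points; p++) {
            const int32_t w = (int32_t)weight_for(p, c) - b_offset;          // usubl v16, v9
            for (int o = 0; o < 9; o++) {
                const int32_t x = (int32_t)inptrs[p * 9 + o][c] - a_offset;  // usubl, v10
                acc[o] += x * w;                                             // smlal
            }
        }
        // ...then requantise: optional sshl, sqrdmulh, srshl, add c_offset, clamp, store.
    }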
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..e8ac603928
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+ typedef uint32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 9;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+ a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
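
The window constants follow the usual stride relation, input = (output - 1) * stride + kernel; since the framework sizes its input patches from these fields, it is worth seeing that the numbers above satisfy it. A one-line check (mine, not in the patch):

    constexpr unsigned int required_input(unsigned int out, unsigned int stride,
                                          unsigned int kern)
    {
        return (out - 1) * stride + kern;  // no dilation in these kernels
    }
    static_assert(required_input(2, 2, 3) == 5, "matches input_rows above");
    static_assert(required_input(4, 2, 3) == 9, "matches input_cols above");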
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2106cf7086
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "movi v5.16b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "ushr v5.4s, v5.4s, #0x8\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "mov x11, #0x0\n"
+ "movi v1.4s, #0x0\n"
+ "ld1 { v15.16b }, [x22]\n"
+ "mov x10, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "ld1 { v29.16b }, [x20]\n"
+ "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "movi v25.4s, #0x0\n"
+ "ld1 { v0.16b }, [x19]\n"
+ "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "mov v20.16b, v15.16b\n"
+ "ldr x19, [%x[inptrs], #0x20]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x2\n"
+ "ld1r { v4.4s }, [x21]\n"
+ "mov v17.16b, v15.16b\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ld1 { v7.16b }, [x19]\n"
+ "mov v23.16b, v15.16b\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x6\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "mov v18.16b, v29.16b\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "zip1 v15.4s, v15.4s, v17.4s\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+ "ld1r { v14.4s }, [x9]\n"
+ "zip1 v20.4s, v20.4s, v23.4s\n"
+ "ld1r { v27.4s }, [x28]\n"
+ "zip1 v15.4s, v15.4s, v20.4s\n"
+ "ld1r { v23.4s }, [x27]\n"
+ "mov v17.16b, v29.16b\n"
+ "ldr q6, [%x[params], #0x0]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "mov v11.16b, v29.16b\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip1 v29.4s, v29.4s, v17.4s\n"
+ "mov v12.16b, v0.16b\n"
+ "ext v12.16b, v12.16b, v12.16b, #0x2\n"
+ "zip1 v18.4s, v18.4s, v11.4s\n"
+ "zip1 v29.4s, v29.4s, v18.4s\n"
+ "mov v17.16b, v0.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "mov v11.16b, v0.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+ "mov v18.16b, v2.16b\n"
+ "zip1 v0.4s, v0.4s, v17.4s\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+ "zip1 v12.4s, v12.4s, v11.4s\n"
+ "zip1 v0.4s, v0.4s, v12.4s\n"
+ "mov v17.16b, v2.16b\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "mov v19.16b, v2.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x6\n"
+ "mov v28.16b, v7.16b\n"
+ "zip1 v2.4s, v2.4s, v17.4s\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "zip1 v18.4s, v18.4s, v19.4s\n"
+ "zip1 v2.4s, v2.4s, v18.4s\n"
+ "mov v18.16b, v7.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x4\n"
+ "mov v21.16b, v7.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x6\n"
+ "movi v30.4s, #0x0\n"
+ "zip1 v7.4s, v7.4s, v18.4s\n"
+ "movi v3.4s, #0x0\n"
+ "zip1 v28.4s, v28.4s, v21.4s\n"
+ "zip1 v7.4s, v7.4s, v28.4s\n"
+ "movi v12.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ ".inst 0x6f8fe0ba // udot v26.4s, v5.16b, v15.4b[0]\n"
+ ".inst 0x6fafe0a1 // udot v1.4s, v5.16b, v15.4b[1]\n"
+ ".inst 0x6f8fe8b6 // udot v22.4s, v5.16b, v15.4b[2]\n"
+ ".inst 0x6fafe8b9 // udot v25.4s, v5.16b, v15.4b[3]\n"
+ ".inst 0x6f9de0ad // udot v13.4s, v5.16b, v29.4b[0]\n"
+ ".inst 0x6fbde0be // udot v30.4s, v5.16b, v29.4b[1]\n"
+ ".inst 0x6f9de8a3 // udot v3.4s, v5.16b, v29.4b[2]\n"
+ ".inst 0x6fbde8ac // udot v12.4s, v5.16b, v29.4b[3]\n"
+ ".inst 0x6f80e0ab // udot v11.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b3 // udot v19.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8b5 // udot v21.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6f82e0b0 // udot v16.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bc // udot v28.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8b2 // udot v18.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8b4 // udot v20.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f87e0b8 // udot v24.4s, v5.16b, v7.4b[0]\n"
+ ".inst 0x6fa7e0bf // udot v31.4s, v5.16b, v7.4b[1]\n"
+ "mov v26.16b, v26.16b\n"
+ "mov v1.16b, v1.16b\n"
+ "mov v22.16b, v22.16b\n"
+ "mov v25.16b, v25.16b\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "movi v13.4s, #0x0\n"
+ ".inst 0x6f87e8ad // udot v13.4s, v5.16b, v7.4b[2]\n"
+ "add v1.4s, v1.4s, v30.4s\n"
+ "movi v30.4s, #0x0\n"
+ ".inst 0x6fa7e8be // udot v30.4s, v5.16b, v7.4b[3]\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v1.4s, v1.4s, v19.4s\n"
+ "add v22.4s, v22.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v17.4s\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v19.16b, v21.16b\n"
+ "mov v21.16b, v17.16b\n"
+ "add v11.4s, v11.4s, v16.4s\n"
+ "add v3.4s, v3.4s, v28.4s\n"
+ "add v19.4s, v19.4s, v18.4s\n"
+ "add v21.4s, v21.4s, v20.4s\n"
+ "add v11.4s, v11.4s, v24.4s\n"
+ "add v3.4s, v3.4s, v31.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v30.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "mul v26.4s, v26.4s, v4.4s\n"
+ "str q26, [SP, #0x0]\n"
+ "mul v1.4s, v1.4s, v4.4s\n"
+ "mul v22.4s, v22.4s, v4.4s\n"
+ "str q1, [SP, #0x10]\n"
+ "mul v25.4s, v25.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "str q22, [SP, #0x20]\n"
+ "mul v3.4s, v3.4s, v4.4s\n"
+ "str q25, [SP, #0x30]\n"
+ "mul v19.4s, v19.4s, v4.4s\n"
+ "mul v21.4s, v21.4s, v4.4s\n"
+ "str q11, [SP, #0x40]\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "str q3, [SP, #0x50]\n"
+ "add v1.4s, v1.4s, v6.4s\n"
+ "str q19, [SP, #0x60]\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "str q21, [SP, #0x70]\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "add v3.4s, v3.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ ".inst 0x6f8fe11a // udot v26.4s, v8.16b, v15.4b[0]\n"
+ "ldr q20, [%x[params], #0x0]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x6fafe101 // udot v1.4s, v8.16b, v15.4b[1]\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6f8fe916 // udot v22.4s, v8.16b, v15.4b[2]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x6fafe919 // udot v25.4s, v8.16b, v15.4b[3]\n"
+ ".inst 0x6f80e10b // udot v11.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e103 // udot v3.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e915 // udot v21.4s, v8.16b, v0.4b[3]\n"
+ "ldr q8, [%x[params], #0x30]\n"
+ ".inst 0x6f9de13a // udot v26.4s, v9.16b, v29.4b[0]\n"
+ ".inst 0x6fbde121 // udot v1.4s, v9.16b, v29.4b[1]\n"
+ ".inst 0x6f9de936 // udot v22.4s, v9.16b, v29.4b[2]\n"
+ ".inst 0x6fbde939 // udot v25.4s, v9.16b, v29.4b[3]\n"
+ ".inst 0x6f82e12b // udot v11.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e123 // udot v3.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6f82e933 // udot v19.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e935 // udot v21.4s, v9.16b, v2.4b[3]\n"
+ "ldr q9, [%x[params], #0x40]\n"
+ ".inst 0x6f80e15a // udot v26.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e141 // udot v1.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6f80e956 // udot v22.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e959 // udot v25.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6f87e14b // udot v11.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x6fa7e143 // udot v3.4s, v10.16b, v7.4b[1]\n"
+ ".inst 0x6f87e953 // udot v19.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x6fa7e955 // udot v21.4s, v10.16b, v7.4b[3]\n"
+ "ldr q10, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+ "and v30.16b, v26.16b, v4.16b\n"
+ "and v17.16b, v1.16b, v4.16b\n"
+ "and v16.16b, v22.16b, v4.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v30.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v4.16b\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v1.4s, v1.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v23.4s\n"
+ "smin v1.4s, v1.4s, v23.4s\n"
+ "smin v22.4s, v22.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "smax v1.4s, v1.4s, v27.4s\n"
+ "smax v22.4s, v22.4s, v27.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x26, x10]\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q26, [SP, #0x0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "str s1, [x25, x10]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "ldr q1, [SP, #0x10]\n"
+ "and v16.16b, v11.16b, v4.16b\n"
+ "str s22, [x24, x10]\n"
+ "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+ "ldr q22, [SP, #0x20]\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "and v17.16b, v3.16b, v4.16b\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v23.4s\n"
+ "and v16.16b, v19.16b, v4.16b\n"
+ "srshl v11.4s, v11.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v27.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x23, x10]\n"
+ "smin v11.4s, v11.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v4.4s\n"
+ "ldr q25, [SP, #0x30]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v11.4s, v11.4s, v27.4s\n"
+ "add v3.4s, v3.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "smin v3.4s, v3.4s, v23.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str s11, [x22, x10]\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "ldr q11, [SP, #0x40]\n"
+ "and v16.16b, v21.16b, v4.16b\n"
+ "add v26.4s, v26.4s, v6.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v19.4s, v19.4s, v23.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str s3, [x21, x10]\n"
+ "smax v19.4s, v19.4s, v27.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr q3, [SP, #0x50]\n"
+ "add v1.4s, v1.4s, v6.4s\n"
+ "add v22.4s, v22.4s, v6.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x20, x10]\n"
+ "add v25.4s, v25.4s, v6.4s\n"
+ "add v11.4s, v11.4s, v6.4s\n"
+ "ldr q19, [SP, #0x60]\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "add v3.4s, v3.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x19, x10]\n"
+ "add x10, x10, #0x4\n"
+ "ldr q21, [SP, #0x70]\n"
+ "add v21.4s, v21.4s, v6.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ ".inst 0x6f8fe11a // udot v26.4s, v8.16b, v15.4b[0]\n"
+ "ldr q20, [%x[params], #0x0]\n"
+ "add x26, x26, x10\n"
+ ".inst 0x6fafe101 // udot v1.4s, v8.16b, v15.4b[1]\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ "add x25, x25, x10\n"
+ ".inst 0x6f8fe916 // udot v22.4s, v8.16b, v15.4b[2]\n"
+ "add x24, x24, x10\n"
+ ".inst 0x6fafe919 // udot v25.4s, v8.16b, v15.4b[3]\n"
+ "add x23, x23, x10\n"
+ ".inst 0x6f80e10b // udot v11.4s, v8.16b, v0.4b[0]\n"
+ "add x22, x22, x10\n"
+ ".inst 0x6fa0e103 // udot v3.4s, v8.16b, v0.4b[1]\n"
+ "add x21, x21, x10\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ "add x20, x20, x10\n"
+ ".inst 0x6fa0e915 // udot v21.4s, v8.16b, v0.4b[3]\n"
+ "add x19, x19, x10\n"
+ ".inst 0x6f9de13a // udot v26.4s, v9.16b, v29.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x6fbde121 // udot v1.4s, v9.16b, v29.4b[1]\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x6f9de936 // udot v22.4s, v9.16b, v29.4b[2]\n"
+ ".inst 0x6fbde939 // udot v25.4s, v9.16b, v29.4b[3]\n"
+ ".inst 0x6f82e12b // udot v11.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e123 // udot v3.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6f82e933 // udot v19.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e935 // udot v21.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6f80e15a // udot v26.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e141 // udot v1.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6f80e956 // udot v22.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e959 // udot v25.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6f87e14b // udot v11.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x6fa7e143 // udot v3.4s, v10.16b, v7.4b[1]\n"
+ ".inst 0x6f87e953 // udot v19.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x6fa7e955 // udot v21.4s, v10.16b, v7.4b[3]\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "and v30.16b, v26.16b, v4.16b\n"
+ "and v17.16b, v1.16b, v4.16b\n"
+ "and v16.16b, v22.16b, v4.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v30.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v4.16b\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v1.4s, v1.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v23.4s\n"
+ "smin v1.4s, v1.4s, v23.4s\n"
+ "smin v22.4s, v22.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "smax v1.4s, v1.4s, v27.4s\n"
+ "smax v22.4s, v22.4s, v27.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+ "and v16.16b, v11.16b, v4.16b\n"
+ "and v17.16b, v3.16b, v4.16b\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v23.4s\n"
+ "sqadd v11.4s, v11.4s, v16.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "smax v25.4s, v25.4s, v27.4s\n"
+ "and v16.16b, v19.16b, v4.16b\n"
+ "srshl v11.4s, v11.4s, v4.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "srshl v3.4s, v3.4s, v4.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v3.4s, v3.4s, v14.4s\n"
+ "smin v11.4s, v11.4s, v23.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v23.4s\n"
+ "smax v11.4s, v11.4s, v27.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "and v16.16b, v21.16b, v4.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v19.4s, v19.4s, v23.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v27.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v23.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "blt 3f\n"
+ "str s26, [x26, #0x0]\n"
+ "str s1, [x25, #0x0]\n"
+ "str s22, [x24, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s11, [x22, #0x0]\n"
+ "str s3, [x21, #0x0]\n"
+ "str s19, [x20, #0x0]\n"
+ "str s21, [x19, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "st1 { v26.b }[0], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[0], [x25], #0x1\n"
+ "st1 { v22.b }[0], [x24], #0x1\n"
+ "st1 { v25.b }[0], [x23], #0x1\n"
+ "st1 { v11.b }[0], [x22], #0x1\n"
+ "st1 { v3.b }[0], [x21], #0x1\n"
+ "st1 { v19.b }[0], [x20], #0x1\n"
+ "st1 { v21.b }[0], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v26.b }[1], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[1], [x25], #0x1\n"
+ "st1 { v22.b }[1], [x24], #0x1\n"
+ "st1 { v25.b }[1], [x23], #0x1\n"
+ "st1 { v11.b }[1], [x22], #0x1\n"
+ "st1 { v3.b }[1], [x21], #0x1\n"
+ "st1 { v19.b }[1], [x20], #0x1\n"
+ "st1 { v21.b }[1], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v26.b }[2], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[2], [x25], #0x1\n"
+ "st1 { v22.b }[2], [x24], #0x1\n"
+ "st1 { v25.b }[2], [x23], #0x1\n"
+ "st1 { v11.b }[2], [x22], #0x1\n"
+ "st1 { v3.b }[2], [x21], #0x1\n"
+ "st1 { v19.b }[2], [x20], #0x1\n"
+ "st1 { v21.b }[2], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v26.b }[3], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v1.b }[3], [x25], #0x1\n"
+ "st1 { v22.b }[3], [x24], #0x1\n"
+ "st1 { v25.b }[3], [x23], #0x1\n"
+ "st1 { v11.b }[3], [x22], #0x1\n"
+ "st1 { v3.b }[3], [x21], #0x1\n"
+ "st1 { v19.b }[3], [x20], #0x1\n"
+ "st1 { v21.b }[3], [x19], #0x1\n"
+ "4:" // Tail: End
+ "add SP, SP, #0x80\n"
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
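
The prologue of the kernel above builds its zero-point corrections with UDOT against a vector of ones: "movi v5.16b, #0x1" followed by "ushr v5.4s, v5.4s, #0x8" leaves 0x00010101 in every 32-bit lane, so each indexed dot product sums the three input bytes feeding one output position, and the totals, scaled by the negated b_offset, are spilled to the stack before the main loop. The algebra being exploited, checked for one 3-tap row with illustrative values (my sketch; the a_offset and bias terms are presumably folded into the packed parameters):

    #include <cstdint>

    static_assert([]{
        constexpr int32_t a_off = 3, b_off = 7;   // illustrative zero points
        constexpr int32_t x[3] = {10, 20, 30};    // quantised inputs
        constexpr int32_t w[3] = {1, 2, 3};       // quantised weights
        int32_t lhs = 0, xw = 0, sx = 0, sw = 0;
        for (int i = 0; i < 3; i++) {
            lhs += (x[i] - a_off) * (w[i] - b_off);
            xw += x[i] * w[i];  sx += x[i];  sw += w[i];
        }
        // -b_off*sx is the part the UDOT prologue precomputes per output.
        return lhs == xw - b_off * sx - a_off * sw + 3 * a_off * b_off;
    }(), "zero-point expansion of the quantised dot product");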
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..c5e0417c20
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+ typedef uint32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 8;
+ constexpr static unsigned int input_cols = 6;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+ a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
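
For this 5x5 stride-1 variant the same relation gives (4 - 1) * 1 + 5 = 8 input rows and (2 - 1) * 1 + 5 = 6 input columns, matching the fields above. Because a 5-tap row spans two packed 4-byte groups, the generic.cpp that follows prepares two summing patterns, "movi v15.16b, #0x1" (all four bytes of a group) and "movi v14.4s, #0x1" (just the first byte of the next group), and accumulates their UDOTs pairwise. A small check of that split, under my reading of the packing (illustrative values):

    #include <cstdint>

    constexpr uint32_t dot_bytes(uint32_t ones, uint32_t x)  // one UDOT lane
    {
        uint32_t s = 0;
        for (int i = 0; i < 4; i++)
            s += ((ones >> (8 * i)) & 0xffu) * ((x >> (8 * i)) & 0xffu);
        return s;
    }
    // Taps t0..t4 packed as group0 = {t0,t1,t2,t3}, group1 = {t4, ...}:
    static_assert(dot_bytes(0x01010101u, 0x04030201u)   // movi v15.16b, #0x1
                + dot_bytes(0x00000001u, 0x00000005u)   // movi v14.4s, #0x1
                == 1 + 2 + 3 + 4 + 5,
                  "five-tap row sum via two UDOTs");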
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8bcd682e3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "movi v15.16b, #0x1\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "add SP, SP, #-0x80\n"
+ "movi v14.4s, #0x1\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "movi v28.4s, #0x0\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "mov x11, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ld1 { v13.16b }, [x21]\n"
+ "mov x10, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "ld1 { v12.16b }, [x20]\n"
+ "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "movi v25.4s, #0x0\n"
+ "ld1 { v7.16b }, [x19]\n"
+ "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "ldr x19, [%x[inptrs], #0x28]\n"
+ "mov v17.16b, v12.16b\n"
+ "ld1 { v6.16b }, [x21]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "mov v16.16b, v7.16b\n"
+ "ld1 { v4.16b }, [x19]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "zip1 v13.2d, v13.2d, v18.2d\n"
+ "ldr x19, [%x[inptrs], #0x38]\n"
+ "zip1 v12.2d, v12.2d, v17.2d\n"
+ "ld1r { v3.4s }, [x22]\n"
+ "mov v18.16b, v6.16b\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v16.2d\n"
+ "ld1 { v1.16b }, [x19]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v5.16b\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "mov v16.16b, v4.16b\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "zip1 v6.2d, v6.2d, v18.2d\n"
+ "ld1r { v0.4s }, [x9]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ld1r { v31.4s }, [x28]\n"
+ "zip1 v5.2d, v5.2d, v17.2d\n"
+ "ld1r { v30.4s }, [x27]\n"
+ "mov v17.16b, v2.16b\n"
+ "ldr q29, [%x[params], #0x0]\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "zip1 v4.2d, v4.2d, v16.2d\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "mov v16.16b, v1.16b\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "add %x[params], %x[params], #0x50\n"
+ "zip1 v2.2d, v2.2d, v17.2d\n"
+ "movi v23.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "zip1 v1.2d, v1.2d, v16.2d\n"
+ "movi v21.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6f8de1fc // udot v28.4s, v15.16b, v13.4b[0]\n"
+ ".inst 0x6f8de9fb // udot v27.4s, v15.16b, v13.4b[2]\n"
+ ".inst 0x6f8ce1fa // udot v26.4s, v15.16b, v12.4b[0]\n"
+ ".inst 0x6f8ce9f9 // udot v25.4s, v15.16b, v12.4b[2]\n"
+ ".inst 0x6fade1dc // udot v28.4s, v14.16b, v13.4b[1]\n"
+ ".inst 0x6fade9db // udot v27.4s, v14.16b, v13.4b[3]\n"
+ ".inst 0x6face1da // udot v26.4s, v14.16b, v12.4b[1]\n"
+ ".inst 0x6face9d9 // udot v25.4s, v14.16b, v12.4b[3]\n"
+ ".inst 0x6f87e1f8 // udot v24.4s, v15.16b, v7.4b[0]\n"
+ ".inst 0x6f87e9f7 // udot v23.4s, v15.16b, v7.4b[2]\n"
+ ".inst 0x6f86e1f6 // udot v22.4s, v15.16b, v6.4b[0]\n"
+ ".inst 0x6f86e9f5 // udot v21.4s, v15.16b, v6.4b[2]\n"
+ ".inst 0x6fa7e1d8 // udot v24.4s, v14.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e9d7 // udot v23.4s, v14.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e1d6 // udot v22.4s, v14.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e9d5 // udot v21.4s, v14.16b, v6.4b[3]\n"
+ ".inst 0x6f85e1f2 // udot v18.4s, v15.16b, v5.4b[0]\n"
+ ".inst 0x6f85e9f1 // udot v17.4s, v15.16b, v5.4b[2]\n"
+ ".inst 0x6f84e1f0 // udot v16.4s, v15.16b, v4.4b[0]\n"
+ ".inst 0x6f84e9f4 // udot v20.4s, v15.16b, v4.4b[2]\n"
+ ".inst 0x6fa5e1d2 // udot v18.4s, v14.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e9d1 // udot v17.4s, v14.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e1d0 // udot v16.4s, v14.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e9d4 // udot v20.4s, v14.16b, v4.4b[3]\n"
+ ".inst 0x6f82e1f3 // udot v19.4s, v15.16b, v2.4b[0]\n"
+ "mov v28.16b, v28.16b\n"
+ "mov v27.16b, v27.16b\n"
+ "add v28.4s, v28.4s, v26.4s\n"
+ ".inst 0x6fa2e1d3 // udot v19.4s, v14.16b, v2.4b[1]\n"
+ "add v27.4s, v27.4s, v25.4s\n"
+ "add v28.4s, v28.4s, v24.4s\n"
+ "mov v26.16b, v26.16b\n"
+ "add v27.4s, v27.4s, v23.4s\n"
+ "add v28.4s, v28.4s, v22.4s\n"
+ "mov v25.16b, v25.16b\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v28.4s, v18.4s\n"
+ "add v26.4s, v26.4s, v24.4s\n"
+ "add v27.4s, v27.4s, v17.4s\n"
+ "add v25.4s, v25.4s, v23.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "mov v24.16b, v24.16b\n"
+ "add v25.4s, v25.4s, v21.4s\n"
+ "add v26.4s, v26.4s, v18.4s\n"
+ "mov v23.16b, v23.16b\n"
+ "add v25.4s, v25.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v22.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v18.4s\n"
+ "mov v22.16b, v22.16b\n"
+ "add v23.4s, v23.4s, v17.4s\n"
+ "add v24.4s, v24.4s, v16.4s\n"
+ "mov v21.16b, v21.16b\n"
+ "add v23.4s, v23.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v19.4s\n"
+ "add v22.4s, v22.4s, v18.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6f82e9f2 // udot v18.4s, v15.16b, v2.4b[2]\n"
+ "add v21.4s, v21.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f81e1f1 // udot v17.4s, v15.16b, v1.4b[0]\n"
+ ".inst 0x6fa2e9d2 // udot v18.4s, v14.16b, v2.4b[3]\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x6fa1e1d1 // udot v17.4s, v14.16b, v1.4b[1]\n"
+ ".inst 0x6f81e9f0 // udot v16.4s, v15.16b, v1.4b[2]\n"
+ "add v23.4s, v23.4s, v18.4s\n"
+ "add v21.4s, v21.4s, v20.4s\n"
+ "add v22.4s, v22.4s, v19.4s\n"
+ ".inst 0x6fa1e9d0 // udot v16.4s, v14.16b, v1.4b[3]\n"
+ "add v21.4s, v21.4s, v18.4s\n"
+ "add v22.4s, v22.4s, v17.4s\n"
+ "neg v3.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "mul v28.4s, v28.4s, v3.4s\n"
+ "str q28, [SP, #0x0]\n"
+ "mul v27.4s, v27.4s, v3.4s\n"
+ "mul v26.4s, v26.4s, v3.4s\n"
+ "str q27, [SP, #0x10]\n"
+ "mul v25.4s, v25.4s, v3.4s\n"
+ "mul v24.4s, v24.4s, v3.4s\n"
+ "str q26, [SP, #0x20]\n"
+ "mul v23.4s, v23.4s, v3.4s\n"
+ "str q25, [SP, #0x30]\n"
+ "mul v22.4s, v22.4s, v3.4s\n"
+ "mul v21.4s, v21.4s, v3.4s\n"
+ "str q24, [SP, #0x40]\n"
+ "add v28.4s, v28.4s, v29.4s\n"
+ "str q23, [SP, #0x50]\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "str q22, [SP, #0x60]\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q21, [SP, #0x70]\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ ".inst 0x6f8de11c // udot v28.4s, v8.16b, v13.4b[0]\n"
+ "ldr q20, [%x[params], #0x60]\n"
+ "add x11, x11, #0x10\n"
+ ".inst 0x6f8de91b // udot v27.4s, v8.16b, v13.4b[2]\n"
+ "ldr q19, [%x[params], #0x70]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6f8ce11a // udot v26.4s, v8.16b, v12.4b[0]\n"
+ "ldr q29, [%x[params], #0x80]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x6f8ce919 // udot v25.4s, v8.16b, v12.4b[2]\n"
+ ".inst 0x6f87e118 // udot v24.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x6f87e917 // udot v23.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6f86e116 // udot v22.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x6f86e915 // udot v21.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ ".inst 0x6fade13c // udot v28.4s, v9.16b, v13.4b[1]\n"
+ ".inst 0x6fade93b // udot v27.4s, v9.16b, v13.4b[3]\n"
+ ".inst 0x6face13a // udot v26.4s, v9.16b, v12.4b[1]\n"
+ ".inst 0x6face939 // udot v25.4s, v9.16b, v12.4b[3]\n"
+ ".inst 0x6fa7e138 // udot v24.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e937 // udot v23.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e136 // udot v22.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e935 // udot v21.4s, v9.16b, v6.4b[3]\n"
+ "ldr q9, [%x[params], #0x10]\n"
+ ".inst 0x6f8ce15c // udot v28.4s, v10.16b, v12.4b[0]\n"
+ ".inst 0x6f8ce95b // udot v27.4s, v10.16b, v12.4b[2]\n"
+ ".inst 0x6f87e15a // udot v26.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x6f87e959 // udot v25.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x6f86e158 // udot v24.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x6f86e957 // udot v23.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x6f85e955 // udot v21.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%x[params], #0x20]\n"
+ ".inst 0x6face17c // udot v28.4s, v11.16b, v12.4b[1]\n"
+ ".inst 0x6face97b // udot v27.4s, v11.16b, v12.4b[3]\n"
+ ".inst 0x6fa7e17a // udot v26.4s, v11.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e979 // udot v25.4s, v11.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e178 // udot v24.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e977 // udot v23.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6fa5e176 // udot v22.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e975 // udot v21.4s, v11.16b, v5.4b[3]\n"
+ "ldr q11, [%x[params], #0x30]\n"
+ ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x6f87e91b // udot v27.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6f86e11a // udot v26.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x6f86e919 // udot v25.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x6f85e118 // udot v24.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e917 // udot v23.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x6f84e116 // udot v22.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e915 // udot v21.4s, v8.16b, v4.4b[2]\n"
+ "ldr q8, [%x[params], #0x40]\n"
+ ".inst 0x6fa7e13c // udot v28.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e93b // udot v27.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e13a // udot v26.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e939 // udot v25.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x6fa5e138 // udot v24.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e937 // udot v23.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e136 // udot v22.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e935 // udot v21.4s, v9.16b, v4.4b[3]\n"
+ "ldr q9, [%x[params], #0x50]\n"
+ ".inst 0x6f86e15c // udot v28.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x6f86e95b // udot v27.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x6f85e15a // udot v26.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x6f85e959 // udot v25.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x6f84e158 // udot v24.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e957 // udot v23.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x6f82e156 // udot v22.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e955 // udot v21.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%x[params], #0xb0]\n"
+ ".inst 0x6fa6e17c // udot v28.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e97b // udot v27.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6fa5e17a // udot v26.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e979 // udot v25.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e178 // udot v24.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e977 // udot v23.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x6fa2e176 // udot v22.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e975 // udot v21.4s, v11.16b, v2.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x6f85e11c // udot v28.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x6f84e11a // udot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e917 // udot v23.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ ".inst 0x6fa5e13c // udot v28.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e13a // udot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e937 // udot v23.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa1e136 // udot v22.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e935 // udot v21.4s, v9.16b, v1.4b[3]\n"
+ "ldr q9, [%x[params], #0xa0]\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+ "and v18.16b, v28.16b, v19.16b\n"
+ "and v17.16b, v27.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v19.16b\n"
+ "srshl v28.4s, v28.4s, v19.4s\n"
+ "srshl v27.4s, v27.4s, v19.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v30.4s\n"
+ "smin v27.4s, v27.4s, v30.4s\n"
+ "smin v26.4s, v26.4s, v30.4s\n"
+ "smax v28.4s, v28.4s, v31.4s\n"
+ "smax v27.4s, v27.4s, v31.4s\n"
+ "smax v26.4s, v26.4s, v31.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x26, x10]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ldr q28, [SP, #0x0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "str s27, [x25, x10]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ldr q27, [SP, #0x10]\n"
+ "and v16.16b, v24.16b, v19.16b\n"
+ "str s26, [x24, x10]\n"
+ "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+ "ldr q26, [SP, #0x20]\n"
+ "srshl v25.4s, v25.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v30.4s\n"
+ "and v16.16b, v22.16b, v19.16b\n"
+ "srshl v24.4s, v24.4s, v19.4s\n"
+ "smax v25.4s, v25.4s, v31.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x23, x10]\n"
+ "smin v24.4s, v24.4s, v30.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "ldr q25, [SP, #0x30]\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v24.4s, v24.4s, v31.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "smin v23.4s, v23.4s, v30.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x22, x10]\n"
+ "smax v23.4s, v23.4s, v31.4s\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "ldr q24, [SP, #0x40]\n"
+ "and v16.16b, v21.16b, v19.16b\n"
+ "add v28.4s, v28.4s, v29.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v22.4s, v22.4s, v30.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x21, x10]\n"
+ "smax v22.4s, v22.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr q23, [SP, #0x50]\n"
+ "add v27.4s, v27.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x20, x10]\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "ldr q22, [SP, #0x60]\n"
+ "srshl v21.4s, v21.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "smin v21.4s, v21.4s, v30.4s\n"
+ "smax v21.4s, v21.4s, v31.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x19, x10]\n"
+ "add x10, x10, #0x4\n"
+ "ldr q21, [SP, #0x70]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ ".inst 0x6f8de11c // udot v28.4s, v8.16b, v13.4b[0]\n"
+ "ldr q20, [%x[params], #0x60]\n"
+ "add x26, x26, x10\n"
+ ".inst 0x6f8de91b // udot v27.4s, v8.16b, v13.4b[2]\n"
+ "ldr q19, [%x[params], #0x70]\n"
+ "add x25, x25, x10\n"
+ ".inst 0x6f8ce11a // udot v26.4s, v8.16b, v12.4b[0]\n"
+ "add x24, x24, x10\n"
+ ".inst 0x6f8ce919 // udot v25.4s, v8.16b, v12.4b[2]\n"
+ "add x23, x23, x10\n"
+ ".inst 0x6f87e118 // udot v24.4s, v8.16b, v7.4b[0]\n"
+ "add x22, x22, x10\n"
+ ".inst 0x6f87e917 // udot v23.4s, v8.16b, v7.4b[2]\n"
+ "add x21, x21, x10\n"
+ ".inst 0x6f86e116 // udot v22.4s, v8.16b, v6.4b[0]\n"
+ "add x20, x20, x10\n"
+ ".inst 0x6f86e915 // udot v21.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ "add x19, x19, x10\n"
+ ".inst 0x6fade13c // udot v28.4s, v9.16b, v13.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ ".inst 0x6fade93b // udot v27.4s, v9.16b, v13.4b[3]\n"
+ ".inst 0x6face13a // udot v26.4s, v9.16b, v12.4b[1]\n"
+ ".inst 0x6face939 // udot v25.4s, v9.16b, v12.4b[3]\n"
+ ".inst 0x6fa7e138 // udot v24.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e937 // udot v23.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e136 // udot v22.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e935 // udot v21.4s, v9.16b, v6.4b[3]\n"
+ "ldr q9, [%x[params], #0x10]\n"
+ ".inst 0x6f8ce15c // udot v28.4s, v10.16b, v12.4b[0]\n"
+ ".inst 0x6f8ce95b // udot v27.4s, v10.16b, v12.4b[2]\n"
+ ".inst 0x6f87e15a // udot v26.4s, v10.16b, v7.4b[0]\n"
+ ".inst 0x6f87e959 // udot v25.4s, v10.16b, v7.4b[2]\n"
+ ".inst 0x6f86e158 // udot v24.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x6f86e957 // udot v23.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x6f85e955 // udot v21.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%x[params], #0x20]\n"
+ ".inst 0x6face17c // udot v28.4s, v11.16b, v12.4b[1]\n"
+ ".inst 0x6face97b // udot v27.4s, v11.16b, v12.4b[3]\n"
+ ".inst 0x6fa7e17a // udot v26.4s, v11.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e979 // udot v25.4s, v11.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e178 // udot v24.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e977 // udot v23.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6fa5e176 // udot v22.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e975 // udot v21.4s, v11.16b, v5.4b[3]\n"
+ "ldr q11, [%x[params], #0x30]\n"
+ ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ ".inst 0x6f87e91b // udot v27.4s, v8.16b, v7.4b[2]\n"
+ ".inst 0x6f86e11a // udot v26.4s, v8.16b, v6.4b[0]\n"
+ ".inst 0x6f86e919 // udot v25.4s, v8.16b, v6.4b[2]\n"
+ ".inst 0x6f85e118 // udot v24.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e917 // udot v23.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x6f84e116 // udot v22.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e915 // udot v21.4s, v8.16b, v4.4b[2]\n"
+ "ldr q8, [%x[params], #0x40]\n"
+ ".inst 0x6fa7e13c // udot v28.4s, v9.16b, v7.4b[1]\n"
+ ".inst 0x6fa7e93b // udot v27.4s, v9.16b, v7.4b[3]\n"
+ ".inst 0x6fa6e13a // udot v26.4s, v9.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e939 // udot v25.4s, v9.16b, v6.4b[3]\n"
+ ".inst 0x6fa5e138 // udot v24.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e937 // udot v23.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e136 // udot v22.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e935 // udot v21.4s, v9.16b, v4.4b[3]\n"
+ "ldr q9, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x80\n"
+ ".inst 0x6f86e15c // udot v28.4s, v10.16b, v6.4b[0]\n"
+ ".inst 0x6f86e95b // udot v27.4s, v10.16b, v6.4b[2]\n"
+ ".inst 0x6f85e15a // udot v26.4s, v10.16b, v5.4b[0]\n"
+ ".inst 0x6f85e959 // udot v25.4s, v10.16b, v5.4b[2]\n"
+ ".inst 0x6f84e158 // udot v24.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e957 // udot v23.4s, v10.16b, v4.4b[2]\n"
+ ".inst 0x6f82e156 // udot v22.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e955 // udot v21.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa6e17c // udot v28.4s, v11.16b, v6.4b[1]\n"
+ ".inst 0x6fa6e97b // udot v27.4s, v11.16b, v6.4b[3]\n"
+ ".inst 0x6fa5e17a // udot v26.4s, v11.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e979 // udot v25.4s, v11.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e178 // udot v24.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e977 // udot v23.4s, v11.16b, v4.4b[3]\n"
+ ".inst 0x6fa2e176 // udot v22.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e975 // udot v21.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f85e11c // udot v28.4s, v8.16b, v5.4b[0]\n"
+ ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n"
+ ".inst 0x6f84e11a // udot v26.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e917 // udot v23.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6fa5e13c // udot v28.4s, v9.16b, v5.4b[1]\n"
+ ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n"
+ ".inst 0x6fa4e13a // udot v26.4s, v9.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n"
+ ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e937 // udot v23.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa1e136 // udot v22.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e935 // udot v21.4s, v9.16b, v1.4b[3]\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+ "and v18.16b, v28.16b, v19.16b\n"
+ "and v17.16b, v27.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v17.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "and v16.16b, v25.16b, v19.16b\n"
+ "srshl v28.4s, v28.4s, v19.4s\n"
+ "srshl v27.4s, v27.4s, v19.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v27.4s, v27.4s, v0.4s\n"
+ "add v26.4s, v26.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v30.4s\n"
+ "smin v27.4s, v27.4s, v30.4s\n"
+ "smin v26.4s, v26.4s, v30.4s\n"
+ "smax v28.4s, v28.4s, v31.4s\n"
+ "smax v27.4s, v27.4s, v31.4s\n"
+ "smax v26.4s, v26.4s, v31.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v16.16b, v24.16b, v19.16b\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "add v25.4s, v25.4s, v0.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smin v25.4s, v25.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "smax v25.4s, v25.4s, v31.4s\n"
+ "and v16.16b, v22.16b, v19.16b\n"
+ "srshl v24.4s, v24.4s, v19.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v31.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "smax v23.4s, v23.4s, v31.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "srshl v22.4s, v22.4s, v19.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "and v16.16b, v21.16b, v19.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "add v22.4s, v22.4s, v0.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v22.4s, v22.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v19.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "add v21.4s, v21.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v30.4s\n"
+ "smax v21.4s, v21.4s, v31.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "blt 3f\n"
+ "str s28, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s26, [x24, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s22, [x20, #0x0]\n"
+ "str s21, [x19, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "st1 { v28.b }[0], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v25.b }[0], [x23], #0x1\n"
+ "st1 { v24.b }[0], [x22], #0x1\n"
+ "st1 { v23.b }[0], [x21], #0x1\n"
+ "st1 { v22.b }[0], [x20], #0x1\n"
+ "st1 { v21.b }[0], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v28.b }[1], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[1], [x25], #0x1\n"
+ "st1 { v26.b }[1], [x24], #0x1\n"
+ "st1 { v25.b }[1], [x23], #0x1\n"
+ "st1 { v24.b }[1], [x22], #0x1\n"
+ "st1 { v23.b }[1], [x21], #0x1\n"
+ "st1 { v22.b }[1], [x20], #0x1\n"
+ "st1 { v21.b }[1], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v28.b }[2], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v25.b }[2], [x23], #0x1\n"
+ "st1 { v24.b }[2], [x22], #0x1\n"
+ "st1 { v23.b }[2], [x21], #0x1\n"
+ "st1 { v22.b }[2], [x20], #0x1\n"
+ "st1 { v21.b }[2], [x19], #0x1\n"
+ "beq 4f\n"
+ "st1 { v28.b }[3], [x26], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v27.b }[3], [x25], #0x1\n"
+ "st1 { v26.b }[3], [x24], #0x1\n"
+ "st1 { v25.b }[3], [x23], #0x1\n"
+ "st1 { v24.b }[3], [x22], #0x1\n"
+ "st1 { v23.b }[3], [x21], #0x1\n"
+ "st1 { v22.b }[3], [x20], #0x1\n"
+ "st1 { v21.b }[3], [x19], #0x1\n"
+ "4:" // Tail: End
+ "add SP, SP, #0x80\n"
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b52017ce1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
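+// The arguments to the implementation above, in order (names as in the
+// definition in generic.cpp): inptrs, outptrs, weights, bias, kernel_points,
+// n_output_channels, per_channel_left_shifts, per_channel_muls,
+// per_channel_right_shifts, qp.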
+
+struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int output_rows(void) { return 2; };
+ constexpr static unsigned int output_cols(void) { return 8; };
+
+ constexpr static unsigned int output_col_regs(void) { return 2; };
+
+ kern_type kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+ a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
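+
+// A minimal usage sketch (hypothetical caller: `ci` and the kernel arguments
+// are assumed to be prepared elsewhere):
+//   a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst strat(ci);
+//   strat.kernel(inptrs, outptrs, weights, bias, kernel_points,
+//                n_output_channels, left_shifts, muls, right_shifts, qp);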
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ada1818eba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1500 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const uint8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
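+  // An editorial outline (hedged; the assembly below is authoritative):
+  // inputs and weights are widened to s16 with USUBL after subtracting
+  // qp.a_offset and qp.b_offset, accumulated with SMLAL into sixteen s32
+  // accumulators (the 2x8 output points of a 4-channel block), then
+  // requantised: optional left shift (SSHL), SQRDMULH, rounding right shift
+  // (SRSHL), add qp.c_offset, clamp to [qp.minval, qp.maxval], UZP1 narrow.
+  // A rough scalar model of one output; helper names are illustrative, and
+  // the per-channel arrays fall back to the per-layer values of qp when the
+  // rq_*_ptr arguments are null:
+  //   int32_t acc = bias ? bias[c] : 0;
+  //   for (unsigned int k = 0; k < kernel_points; k++)
+  //     acc += (w[k][c] - qp.b_offset) * (x[k][p] - qp.a_offset);
+  //   acc = rounding_shift_right(sat_doubling_mul(acc << lshift[c], mul[c]),
+  //                              rshift[c]);
+  //   out[p][c] = clamp(acc + qp.c_offset, qp.minval, qp.maxval);
+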
+ __asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v13.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v12.16b }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v11.16b }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v10.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v9.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v8.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v7.4s }, [x19]\n"
+ "lsr x28, %x[n_output_channels], #0x2\n"
+ "cbz x28, 9f\n"
+ "1:" // Output channel loop
+ "movi v16.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x9, #0x2\n"
+ "ldr q16, [%x[bias], x19]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v6.16b, v16.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "mov v4.16b, v16.16b\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v16.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x19, x9, #0x2\n"
+ "ldr q8, [%x[rq_mul_ptr], x19]\n"
+ "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s17, [%x[weights]], #0x4\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "mov x19, %x[inptrs]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "ldr d3, [x25, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "cbz x20, 7f\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
+ "ldr d1, [x25, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldr d1, [x25, #0x0]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x28, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
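+    // Oddments: handle the last 1-3 output channels. Bits 1 and 0 of
+    // n_output_channels select 2- and 1-element partial loads and stores
+    // so the tail never reads or writes past the buffers.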
+ "movi v16.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x19, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v16.s }[0], [x19]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v6.16b, v16.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "mov v4.16b, v16.16b\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v16.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v9.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v7.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x19], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x19], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v7.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s17, [%x[weights]], #0x4\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "mov x19, %x[inptrs]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "ldr d3, [x25, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "cbz x20, 22f\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
+ "ldr d1, [x25, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
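+    // Unrolled over two kernel points per iteration: each smlal multiplies
+    // the four per-channel weights (v17, then v16) by one activation lane
+    // of v3/v2 (then v1/v0) and accumulates into one of the sixteen 32-bit
+    // per-output-point accumulators (v6-v4, v31-v19).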
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldr d1, [x25, #0x0]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "23:" // Output channel oddments: Done
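+    // Requantise all sixteen accumulators: sshl by the per-channel left
+    // shift (v9), sqrdmulh by the fixed-point multiplier (v8), apply the
+    // and/sshr/sqadd rounding fix-up, srshl by the right shift (v7), add
+    // the output offset (v10), clamp between v14 and v13, and narrow to
+    // bytes with uzp1 before storing.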
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.h }[0], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.h }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.h }[0], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.h }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.h }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.h }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.h }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.h }[0], [x25]\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v19.h }[0], [x26]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.b }[2], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.b }[2], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.b }[2], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.b }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.b }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.b }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.b }[2], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.b }[2], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.b }[2], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.b }[2], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.b }[2], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.b }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.b }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v19.b }[2], [x26]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.b }[0], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.b }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.b }[0], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.b }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.b }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.b }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.b }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.b }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.b }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.b }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.b }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.b }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.b }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.b }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.b }[0], [x25]\n"
+ "st1 { v19.b }[0], [x26]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+
+ "26:" // Done
+
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1bacb5ffe7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
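+
+  // A sanity check of the geometry above, assuming the standard
+  // direct-convolution relation input = stride * (output - 1) + kernel
+  // (an illustrative addition, not part of the generated interface):
+  static_assert(stride_rows * (output_rows - 1) + kernel_rows == input_rows,
+                "2x2 output of a 3x3 s1 kernel reads a 4x4 input tile");
+  static_assert(stride_cols * (output_cols - 1) + kernel_cols == input_cols,
+                "2x2 output of a 3x3 s1 kernel reads a 4x4 input tile");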
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+ a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8cbbfae00d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1192 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
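+namespace {
+// Scalar model of the requantisation the assembly below applies to every
+// 32-bit accumulator lane. This is a minimal sketch for reference only
+// (the helper name and scalar form are assumptions, not ACL API); the
+// kernel performs the same steps on NEON vectors with sqrdmulh/srshl and
+// clamps loaded via ld1r.
+inline uint8_t requantize_lane_model(
+  int32_t acc,         // smlal/smlal2 accumulator
+  int32_t multiplier,  // Q0.31 fixed-point multiplier (requant_muls)
+  int32_t right_shift, // >= 0 here; the kernel keeps it negated for srshl
+  int32_t c_offset,    // output zero point (v13 in the kernel)
+  int32_t minval,      // lower clamp (v15)
+  int32_t maxval)      // upper clamp (v14)
+{
+  // sqrdmulh: rounding doubling multiply-high, i.e. round(acc * m / 2^31).
+  int64_t v = ((int64_t) acc * multiplier + (1ll << 30)) >> 31;
+  if (right_shift > 0)
+  {
+    // The and/sshr/sqadd sequence biases negative values by -1 so the
+    // rounding right shift (srshl) rounds ties away from zero.
+    if (v < 0) v -= 1;
+    v = (v + (1ll << (right_shift - 1))) >> right_shift;
+  }
+  v += c_offset;                // add v13.4s
+  v = v < minval ? minval : v;  // smax with v15
+  v = v > maxval ? maxval : v;  // smin with v14
+  return (uint8_t) v;           // uzp1 + str: narrow and store
+}
+} // namespace
+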
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
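+      // inptrs_raw lists the sixteen pointers of the 4x4 input tile in
+      // row-major order; they are permuted here into the order in which
+      // the assembly dereferences them (the five initial loads first,
+      // then the oddment loads (3,0), (3,3), (0,1) and so on).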
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x17, #0x0\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x15, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x12, x8, #0x3\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v21.16b }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v17.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "cbz x12, 3f\n"
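+    // x12 holds n_channels / 8, the number of full vector passes; branch
+    // straight to the oddments when fewer than eight channels remain.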
+ "subs x12, x12, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q11, [x19, #0x0]\n"
+ "mov v23.16b, v11.16b\n"
+ "ldr q26, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v12.16b, v11.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v24.16b, v11.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v9.16b, v26.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "mov v22.16b, v26.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "mov v10.16b, v26.16b\n"
+ "ldr d4, [x16, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v17.8b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "ssubl v1.8h, v1.8b, v17.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ssubl v2.8h, v2.8b, v17.8b\n"
+ "ldr d7, [x16, #0x38]\n"
+ "ssubl v3.8h, v3.8b, v17.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "ssubl v4.8h, v4.8b, v17.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v17.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "ssubl v6.8h, v6.8b, v17.8b\n"
+ "ssubl v7.8h, v7.8b, v17.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ssubl v8.8h, v8.8b, v17.8b\n"
+ "ldr d31, [x23, x17]\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "ldr d30, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "ldr d28, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "ldr d27, [x19, x17]\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "usubl v27.8h, v27.8b, v21.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
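+    // Main loop: widen the uint8 activations and int8 weights to 16 bits,
+    // then smlal/smlal2 into four accumulator pairs (low/high halves of
+    // eight channels), one pair per pixel of the 2x2 output tile.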
+ "smlal v11.4s, v31.4h, v4.4h\n"
+ "ldr x21, [x14, #0x28]\n"
+ "add x16, x16, #0x48\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "subs x12, x12, #0x1\n"
+ "smlal v23.4s, v31.4h, v3.4h\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal2 v9.4s, v31.8h, v3.8h\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v12.4s, v31.4h, v1.4h\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal2 v22.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v24.4s, v31.4h, v0.4h\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x17]\n"
+ "smlal v11.4s, v30.4h, v0.4h\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x19, x17]\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal v11.4s, v28.4h, v5.4h\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v23.4s, v28.4h, v4.4h\n"
+ "ldr q25, [x13, #0x0]\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ "smlal v12.4s, v28.4h, v2.4h\n"
+ "ldr q16, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ldr q20, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v24.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x17]\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "smlal v12.4s, v31.4h, v6.4h\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x17]\n"
+ "smlal v23.4s, v27.4h, v6.4h\n"
+ "smlal2 v9.4s, v27.8h, v6.8h\n"
+ "smlal v12.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "smlal v24.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "smlal v24.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v28.4h, v0.4h\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x17]\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "smlal v11.4s, v30.4h, v8.4h\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v7.8h\n"
+ "smlal v12.4s, v30.4h, v5.4h\n"
+ "smlal2 v22.4s, v30.8h, v5.8h\n"
+ "smlal v24.4s, v30.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x17]\n"
+ "smlal v11.4s, v29.4h, v3.4h\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal v12.4s, v29.4h, v0.4h\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal v23.4s, v28.4h, v5.4h\n"
+ "smlal2 v9.4s, v28.8h, v5.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x19, x17]\n"
+ "add x17, x17, #0x8\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "smlal v11.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal v12.4s, v31.4h, v3.4h\n"
+ "smlal2 v22.4s, v31.8h, v3.8h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "smlal v24.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v12.4s, v29.4h, v7.4h\n"
+ "smlal2 v22.4s, v29.8h, v7.8h\n"
+ "smlal v24.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "sqrdmulh v11.4s, v11.4s, v25.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v16.4s\n"
+ "smlal v12.4s, v28.4h, v8.4h\n"
+ "smlal2 v22.4s, v28.8h, v8.8h\n"
+ "smlal v24.4s, v28.4h, v7.4h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "and v19.16b, v11.16b, v18.16b\n"
+ "and v5.16b, v26.16b, v20.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+ "sqadd v11.4s, v11.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "and v28.16b, v23.16b, v18.16b\n"
+ "and v8.16b, v9.16b, v20.16b\n"
+ "srshl v11.4s, v11.4s, v18.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "add v11.4s, v11.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "sqadd v9.4s, v9.4s, v8.4s\n"
+ "smax v11.4s, v11.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "uzp1 v11.16b, v11.16b, v26.16b\n"
+ "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x10, x15]\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "add v9.4s, v9.4s, v13.4s\n"
+ "and v1.16b, v12.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v9.4s, v9.4s, v14.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v9.4s, v9.4s, v15.4s\n"
+ "sqadd v12.4s, v12.4s, v1.4s\n"
+ "and v0.16b, v22.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d23, [x9, x15]\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v24.16b, v18.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "sqadd v22.4s, v22.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v13.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v16.16b, v10.16b, v20.16b\n"
+ "smin v12.4s, v12.4s, v14.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqadd v24.4s, v24.4s, v26.4s\n"
+ "smax v12.4s, v12.4s, v15.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v24.4s, v24.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "srshl v10.4s, v10.4s, v20.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "uzp1 v12.16b, v12.16b, v22.16b\n"
+ "add v10.4s, v10.4s, v13.4s\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "str d12, [x28, x15]\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "smax v10.4s, v10.4s, v15.4s\n"
+ "uzp1 v24.16b, v24.16b, v10.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d24, [x27, x15]\n"
+ "add x15, x15, #0x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q11, [x19, #0x0]\n"
+ "mov v23.16b, v11.16b\n"
+ "ldr q26, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v12.16b, v11.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v24.16b, v11.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v9.16b, v26.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "mov v22.16b, v26.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "mov v10.16b, v26.16b\n"
+ "ldr d4, [x16, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v17.8b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "ssubl v1.8h, v1.8b, v17.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ssubl v2.8h, v2.8b, v17.8b\n"
+ "ldr d7, [x16, #0x38]\n"
+ "ssubl v3.8h, v3.8b, v17.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "ssubl v4.8h, v4.8b, v17.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v17.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "ssubl v6.8h, v6.8b, v17.8b\n"
+ "ssubl v7.8h, v7.8b, v17.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ssubl v8.8h, v8.8b, v17.8b\n"
+ "ldr d31, [x23, x17]\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "ldr d30, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "ldr d28, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "ldr d27, [x19, x17]\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "usubl v27.8h, v27.8b, v21.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
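+    // Tail: the final vector pass; tst x8, #0x7 records up front whether
+    // an oddment pass for the n_channels % 8 leftovers must follow.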
+ "smlal v11.4s, v31.4h, v4.4h\n"
+ "ldr x21, [x14, #0x28]\n"
+ "tst x8, #0x7\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x20, [x14, #0x30]\n"
+ "smlal v23.4s, v31.4h, v3.4h\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal2 v9.4s, v31.8h, v3.8h\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v12.4s, v31.4h, v1.4h\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal2 v22.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v24.4s, v31.4h, v0.4h\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x17]\n"
+ "smlal v11.4s, v30.4h, v0.4h\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x19, x17]\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal v11.4s, v28.4h, v5.4h\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v23.4s, v28.4h, v4.4h\n"
+ "ldr q25, [x13, #0x0]\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr q18, [x11, #0x0]\n"
+ "smlal v12.4s, v28.4h, v2.4h\n"
+ "ldr q16, [x13, #0x10]\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ldr q20, [x11, #0x10]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v24.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x17]\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "smlal v12.4s, v31.4h, v6.4h\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x17]\n"
+ "smlal v23.4s, v27.4h, v6.4h\n"
+ "smlal2 v9.4s, v27.8h, v6.8h\n"
+ "smlal v12.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "smlal v24.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "smlal v24.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x17]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v28.4h, v0.4h\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x17]\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x17]\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "smlal v11.4s, v30.4h, v8.4h\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v7.8h\n"
+ "smlal v12.4s, v30.4h, v5.4h\n"
+ "smlal2 v22.4s, v30.8h, v5.8h\n"
+ "smlal v24.4s, v30.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x17]\n"
+ "smlal v11.4s, v29.4h, v3.4h\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal v12.4s, v29.4h, v0.4h\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal v23.4s, v28.4h, v5.4h\n"
+ "smlal2 v9.4s, v28.8h, v5.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x19, x17]\n"
+ "add x17, x17, #0x8\n"
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "smlal v11.4s, v31.4h, v6.4h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal v12.4s, v31.4h, v3.4h\n"
+ "smlal2 v22.4s, v31.8h, v3.8h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "smlal v24.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v12.4s, v29.4h, v7.4h\n"
+ "smlal2 v22.4s, v29.8h, v7.8h\n"
+ "smlal v24.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "sqrdmulh v11.4s, v11.4s, v25.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v16.4s\n"
+ "smlal v12.4s, v28.4h, v8.4h\n"
+ "smlal2 v22.4s, v28.8h, v8.8h\n"
+ "smlal v24.4s, v28.4h, v7.4h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "and v19.16b, v11.16b, v18.16b\n"
+ "and v5.16b, v26.16b, v20.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+ "sqadd v11.4s, v11.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "and v28.16b, v23.16b, v18.16b\n"
+ "and v8.16b, v9.16b, v20.16b\n"
+ "srshl v11.4s, v11.4s, v18.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "add v11.4s, v11.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "smin v11.4s, v11.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "sqadd v9.4s, v9.4s, v8.4s\n"
+ "smax v11.4s, v11.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "uzp1 v11.16b, v11.16b, v26.16b\n"
+ "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x10, x15]\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "add v9.4s, v9.4s, v13.4s\n"
+ "and v1.16b, v12.16b, v18.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v9.4s, v9.4s, v14.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v9.4s, v9.4s, v15.4s\n"
+ "sqadd v12.4s, v12.4s, v1.4s\n"
+ "and v0.16b, v22.16b, v20.16b\n"
+ "uzp1 v23.16b, v23.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d23, [x9, x15]\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v26.16b, v24.16b, v18.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "sqadd v22.4s, v22.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v13.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v16.16b, v10.16b, v20.16b\n"
+ "smin v12.4s, v12.4s, v14.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqadd v24.4s, v24.4s, v26.4s\n"
+ "smax v12.4s, v12.4s, v15.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "srshl v24.4s, v24.4s, v18.4s\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "srshl v10.4s, v10.4s, v20.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "uzp1 v12.16b, v12.16b, v22.16b\n"
+ "add v10.4s, v10.4s, v13.4s\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "str d12, [x28, x15]\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "smax v10.4s, v10.4s, v15.4s\n"
+ "uzp1 v24.16b, v24.16b, v10.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d24, [x27, x15]\n"
+ "add x15, x15, #0x8\n"
+ "beq 64f\n"
+ "add x16, x16, #0x48\n"
+ "3:" // Oddments
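+    // Oddments: bits 2, 1 and 0 of the channel count select 4-, 2- and
+    // 1-byte partial loads from each input pointer, with matching partial
+    // stores at the end, so the tail never accesses out-of-bounds channels.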
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v11.4s }, [x19], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v26.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v26.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v26.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x8, #1, 6f\n"
+ "ld1 { v11.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v11.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v23.16b, v11.16b\n"
+ "ldr d0, [x16, #0x0]\n"
+ "mov v9.16b, v26.16b\n"
+ "ldr d1, [x16, #0x8]\n"
+ "mov v12.16b, v11.16b\n"
+ "ldr d2, [x16, #0x10]\n"
+ "mov v22.16b, v26.16b\n"
+ "ldr d3, [x16, #0x18]\n"
+ "mov v24.16b, v11.16b\n"
+ "ldr d4, [x16, #0x20]\n"
+ "mov v10.16b, v26.16b\n"
+ "ldr d5, [x16, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v17.8b\n"
+ "ldr d6, [x16, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v17.8b\n"
+ "ldr d7, [x16, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v17.8b\n"
+ "ldr d8, [x16, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v17.8b\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "add x23, x23, x17\n"
+ "ssubl v4.8h, v4.8b, v17.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "ssubl v5.8h, v5.8b, v17.8b\n"
+ "ldr x19, [x14, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v17.8b\n"
+ "add x22, x22, x17\n"
+ "ssubl v7.8h, v7.8b, v17.8b\n"
+ "add x21, x21, x17\n"
+ "ssubl v8.8h, v8.8b, v17.8b\n"
+ "add x20, x20, x17\n"
+ "add x19, x19, x17\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x8, #1, 10f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "ldr x21, [x14, #0x28]\n"
+ "add x21, x21, x17\n"
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "usubl v27.8h, v27.8b, v21.8b\n"
+ "smlal v11.4s, v31.4h, v4.4h\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "smlal v23.4s, v31.4h, v3.4h\n"
+ "smlal2 v9.4s, v31.8h, v3.8h\n"
+ "smlal v12.4s, v31.4h, v1.4h\n"
+ "smlal2 v22.4s, v31.8h, v1.8h\n"
+ "smlal v24.4s, v31.4h, v0.4h\n"
+ "smlal2 v10.4s, v31.8h, v0.8h\n"
+ "smlal v11.4s, v30.4h, v0.4h\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "smlal v11.4s, v28.4h, v5.4h\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "smlal v23.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "smlal v12.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v1.4h\n"
+ "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v31.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[6], [x21]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[4], [x21]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x8, #1, 14f\n"
+ "ld1 { v31.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[2], [x21]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[0], [x21]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "ldr x20, [x14, #0x30]\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "add x20, x20, x17\n"
+ "smlal v12.4s, v31.4h, v6.4h\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "smlal v23.4s, v27.4h, v6.4h\n"
+ "smlal2 v9.4s, v27.8h, v6.8h\n"
+ "smlal v12.4s, v27.4h, v4.4h\n"
+ "smlal2 v22.4s, v27.8h, v4.8h\n"
+ "smlal v24.4s, v27.4h, v3.4h\n"
+ "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x8, #1, 18f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "ldr x26, [x14, #0x38]\n"
+ "smlal v24.4s, v29.4h, v8.4h\n"
+ "add x26, x26, x17\n"
+ "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x8, #1, 22f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "ldr x25, [x14, #0x40]\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "add x25, x25, x17\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v23.4s, v28.4h, v0.4h\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v31.s }[0], [x25], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v31.h }[2], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[6], [x25]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[4], [x25]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x8, #1, 26f\n"
+ "ld1 { v31.h }[0], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[2], [x25]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[0], [x25]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "ldr x19, [x14, #0x48]\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "add x19, x19, x17\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v30.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v30.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[6], [x19]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[4], [x19]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x8, #1, 30f\n"
+ "ld1 { v30.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[2], [x19]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[0], [x19]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "ldr x24, [x14, #0x50]\n"
+ "smlal v11.4s, v30.4h, v8.4h\n"
+ "add x24, x24, x17\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v23.4s, v30.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v7.8h\n"
+ "smlal v12.4s, v30.4h, v5.4h\n"
+ "smlal2 v22.4s, v30.8h, v5.8h\n"
+ "smlal v24.4s, v30.4h, v4.4h\n"
+ "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "ldr x23, [x14, #0x58]\n"
+ "smlal v11.4s, v29.4h, v3.4h\n"
+ "add x23, x23, x17\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal v12.4s, v29.4h, v0.4h\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x8, #1, 38f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "ldr x22, [x14, #0x60]\n"
+ "smlal v23.4s, v28.4h, v5.4h\n"
+ "add x22, x22, x17\n"
+ "smlal2 v9.4s, v28.8h, v5.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v31.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x8, #1, 42f\n"
+ "ld1 { v31.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v21.8b\n"
+ "ldr x21, [x14, #0x68]\n"
+ "smlal v11.4s, v31.4h, v6.4h\n"
+ "add x21, x21, x17\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal v12.4s, v31.4h, v3.4h\n"
+ "smlal2 v22.4s, v31.8h, v3.8h\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x8, #1, 46f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v30.8h, v30.8b, v21.8b\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "add x20, x20, x17\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "smlal v24.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x8, #1, 50f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v29.8h, v29.8b, v21.8b\n"
+ "ldr x19, [x14, #0x78]\n"
+ "smlal v12.4s, v29.4h, v7.4h\n"
+ "add x19, x19, x17\n"
+ "smlal2 v22.4s, v29.8h, v7.8h\n"
+ "smlal v24.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v28.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[6], [x19]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[4], [x19]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x8, #1, 54f\n"
+ "ld1 { v28.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[2], [x19]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[0], [x19]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v21.8b\n"
+ "smlal v12.4s, v28.4h, v8.4h\n"
+ "smlal2 v22.4s, v28.8h, v8.8h\n"
+ "smlal v24.4s, v28.4h, v7.4h\n"
+ "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v25.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 56f\n"
+ "ld1 { v16.d }[0], [x13], #0x8\n"
+ "ld1 { v20.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v20.s }[2], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[0], [x13]\n"
+ "ld1 { v20.s }[0], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x8, #1, 58f\n"
+ "ld1 { v25.d }[0], [x13], #0x8\n"
+ "ld1 { v18.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v25.s }[0], [x13]\n"
+ "ld1 { v18.s }[0], [x11]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v11.4s, v11.4s, v25.4s\n"
+ "add x10, x10, x15\n"
+ "sqrdmulh v26.4s, v26.4s, v16.4s\n"
+ "add x9, x9, x15\n"
+ "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+ "add x28, x28, x15\n"
+ "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+ "add x27, x27, x15\n"
+ "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+ "and v19.16b, v11.16b, v18.16b\n"
+ "and v5.16b, v26.16b, v20.16b\n"
+ "and v28.16b, v23.16b, v18.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v23.4s, v23.4s, v28.4s\n"
+ "and v8.16b, v9.16b, v20.16b\n"
+ "srshl v11.4s, v11.4s, v18.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v23.4s, v23.4s, v18.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "add v11.4s, v11.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "smin v11.4s, v11.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smax v11.4s, v11.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v8.4s\n"
+ "uzp1 v11.16b, v11.16b, v26.16b\n"
+ "and v1.16b, v12.16b, v18.16b\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+ "add v9.4s, v9.4s, v13.4s\n"
+ "sqadd v12.4s, v12.4s, v1.4s\n"
+ "and v0.16b, v22.16b, v20.16b\n"
+ "smin v9.4s, v9.4s, v14.4s\n"
+ "and v26.16b, v24.16b, v18.16b\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "smax v9.4s, v9.4s, v15.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "uzp1 v23.16b, v23.16b, v9.16b\n"
+ "add v12.4s, v12.4s, v13.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v22.4s, v22.4s, v0.4s\n"
+ "smin v12.4s, v12.4s, v14.4s\n"
+ "sqadd v24.4s, v24.4s, v26.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v15.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "srshl v24.4s, v24.4s, v18.4s\n"
+ "and v16.16b, v10.16b, v20.16b\n"
+ "add v22.4s, v22.4s, v13.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "srshl v10.4s, v10.4s, v20.4s\n"
+ "uzp1 v12.16b, v12.16b, v22.16b\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "add v10.4s, v10.4s, v13.4s\n"
+ "smin v10.4s, v10.4s, v14.4s\n"
+ "smax v10.4s, v10.4s, v15.4s\n"
+ "uzp1 v24.16b, v24.16b, v10.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v11.s }[0], [x10], #0x4\n"
+ "st1 { v23.s }[0], [x9], #0x4\n"
+ "st1 { v12.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v11.h }[2], [x10], #0x2\n"
+ "st1 { v23.h }[2], [x9], #0x2\n"
+ "st1 { v12.h }[2], [x28], #0x2\n"
+ "st1 { v24.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v11.b }[6], [x10], #0x1\n"
+ "st1 { v23.b }[6], [x9], #0x1\n"
+ "st1 { v12.b }[6], [x28], #0x1\n"
+ "st1 { v24.b }[6], [x27], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v11.b }[4], [x10], #0x1\n"
+ "st1 { v23.b }[4], [x9], #0x1\n"
+ "st1 { v12.b }[4], [x28], #0x1\n"
+ "st1 { v24.b }[4], [x27], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x8, #1, 62f\n"
+ "st1 { v11.h }[0], [x10], #0x2\n"
+ "st1 { v23.h }[0], [x9], #0x2\n"
+ "st1 { v12.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v11.b }[2], [x10], #0x1\n"
+ "st1 { v23.b }[2], [x9], #0x1\n"
+ "st1 { v12.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v11.b }[0], [x10], #0x1\n"
+ "st1 { v23.b }[0], [x9], #0x1\n"
+ "st1 { v12.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+
+ "64:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
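
The block of sqrdmulh / and / sshr / sqadd / srshl instructions ahead of each store above is the fixed-point requantization step shared by all of these quantized kernels: a rounding doubling high multiply by the per-channel multiplier, a rounding right shift by the per-channel shift, the output offset (c_offset), and a min/max clamp before the uzp1 narrowing to bytes. A scalar C++ sketch of the same arithmetic follows — a model for reading the assembly, not the library's API, and it ignores the single saturating corner case of sqrdmulh (acc = mul = INT32_MIN):

#include <algorithm>
#include <cstdint>

// mul comes from requant_muls, shift from requant_shifts (stored as a
// non-positive left-shift amount, so -shift is the right-shift distance).
inline uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
  // sqrdmulh: high 32 bits of 2*acc*mul, with rounding.
  const int64_t prod = (int64_t) acc * (int64_t) mul;
  int32_t v = (int32_t) ((2 * prod + (INT64_C(1) << 31)) >> 32);

  // and/sshr/sqadd followed by srshl: a rounding right shift, with the
  // fix-up so that negative values round half away from zero.
  const int32_t rshift = -shift;
  if (rshift > 0)
  {
    const int64_t nudge = (INT64_C(1) << (rshift - 1)) - (v < 0 ? 1 : 0);
    v = (int32_t) (((int64_t) v + nudge) >> rshift);
  }

  v += c_offset;             // add: output zero point
  v = std::min(v, maxval);   // smin
  v = std::max(v, minval);   // smax
  return (uint8_t) v;        // uzp1 + narrowing store
}
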
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..77861e94f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+ a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
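
The tile geometry constants in these descriptors are linked: the input patch must cover (output − 1) strides plus the kernel extent in each dimension. For the header above, input_rows = (2 − 1) × 2 + 3 = 5; the same relation gives 6 for the 5×5 stride-1 kernel later in this patch. A consistency check one could write against this header (a sketch, assuming the header is included):

using K = arm_conv::depthwise::a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst;
static_assert((K::output_rows - 1) * K::stride_rows + K::kernel_rows == K::input_rows,
              "input tile must cover the strided output tile plus the kernel");
static_assert((K::output_cols - 1) * K::stride_cols + K::kernel_cols == K::input_cols,
              "the same relation holds for columns");
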
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4e1586b033
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
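+      // The raw array is the row-major 5x5 grid of input-row pointers; this
+      // permutation reorders it into the access order hard-coded in the
+      // assembly below. The centre point, inptrs_raw[12], comes first: it is
+      // the one input that contributes to all four output positions.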
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x4, #0x0\n"
+ "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x6, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x7, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x17, x3, #0x3\n"
+ "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v22.16b }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "ld1r { v15.4s }, [x19]\n"
+ "ldp x15, x14, [x21, #0x0]\n"
+ "ldp x13, x12, [x21, #0x10]\n"
+ "cbz x17, 3f\n"
+ "subs x17, x17, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q13, [x19, #0x0]\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr q10, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v11.16b, v13.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr d0, [x5, #0x0]\n"
+ "ldr d1, [x5, #0x8]\n"
+ "mov v20.16b, v10.16b\n"
+ "ldr d2, [x5, #0x10]\n"
+ "mov v17.16b, v10.16b\n"
+ "ldr d3, [x5, #0x18]\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d4, [x5, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d5, [x5, #0x28]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d6, [x5, #0x30]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ldr d7, [x5, #0x38]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d8, [x5, #0x40]\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ldp x26, x25, [x7, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ldp x24, x23, [x7, #0x10]\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldp x22, x21, [x7, #0x20]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "ldp x20, x19, [x7, #0x30]\n"
+ "ldr d31, [x26, x4]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr d30, [x25, x4]\n"
+ "ldr d29, [x24, x4]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr d28, [x23, x4]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr d27, [x22, x4]\n"
+ "ldr d26, [x21, x4]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr d25, [x20, x4]\n"
+ "ldr d24, [x19, x4]\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v13.4s, v31.4h, v8.4h\n"
+ "ldr x22, [x7, #0x40]\n"
+ "add x5, x5, #0x48\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ldr x21, [x7, #0x48]\n"
+ "subs x17, x17, #0x1\n"
+ "smlal v19.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x7, #0x50]\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x19, [x7, #0x58]\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "ldr x11, [x7, #0x60]\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "ldr x10, [x7, #0x68]\n"
+ "smlal v18.4s, v31.4h, v0.4h\n"
+ "ldr x9, [x7, #0x70]\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x7, #0x78]\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x27, [x7, #0x80]\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x26, [x7, #0x88]\n"
+ "smlal v19.4s, v28.4h, v1.4h\n"
+ "ldr x25, [x7, #0x90]\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x21, x4]\n"
+ "smlal v13.4s, v29.4h, v1.4h\n"
+ "ldr x24, [x7, #0x98]\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x22, x4]\n"
+ "smlal v19.4s, v27.4h, v2.4h\n"
+ "ldr x23, [x7, #0xa0]\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x20, x4]\n"
+ "smlal v13.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x7, #0xa8]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x19, x4]\n"
+ "smlal v19.4s, v24.4h, v0.4h\n"
+ "ldr x21, [x7, #0xb0]\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ldr x20, [x7, #0xb8]\n"
+ "smlal v13.4s, v25.4h, v4.4h\n"
+ "ldr x19, [x7, #0xc0]\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x11, x4]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr q31, [x8, #0x0]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr q30, [x16, #0x0]\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "ldr q23, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x9, x4]\n"
+ "smlal v19.4s, v29.4h, v4.4h\n"
+ "ldr q9, [x16, #0x10]\n"
+ "add x16, x16, #0x20\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x10, x4]\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal v13.4s, v27.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "smlal v19.4s, v27.4h, v3.4h\n"
+ "smlal v11.4s, v26.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x28, x4]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x26, x4]\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "smlal v13.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v17.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x25, x4]\n"
+ "smlal v13.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v11.4s, v29.4h, v4.4h\n"
+ "smlal2 v17.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x4]\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v17.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x4]\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x4]\n"
+ "smlal v19.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v18.4s, v28.4h, v1.4h\n"
+ "smlal2 v21.4s, v28.8h, v1.8h\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v26.4h, v5.4h\n"
+ "smlal2 v21.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x21, x4]\n"
+ "smlal v11.4s, v25.4h, v6.4h\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x4]\n"
+ "smlal v19.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v18.4s, v29.4h, v2.4h\n"
+ "smlal2 v21.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x19, x4]\n"
+ "add x4, x4, #0x8\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal2 v17.4s, v27.8h, v7.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v5.4h\n"
+ "smlal2 v17.4s, v24.8h, v5.8h\n"
+ "smlal v18.4s, v26.4h, v7.4h\n"
+ "smlal2 v21.4s, v26.8h, v7.8h\n"
+ "smlal v11.4s, v25.4h, v8.4h\n"
+ "smlal2 v17.4s, v25.8h, v8.8h\n"
+ "smlal v18.4s, v25.4h, v6.4h\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "smlal v18.4s, v29.4h, v8.4h\n"
+ "smlal2 v21.4s, v29.8h, v8.8h\n"
+ "and v27.16b, v13.16b, v30.16b\n"
+ "and v7.16b, v10.16b, v9.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v31.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqadd v13.4s, v13.4s, v27.4s\n"
+ "sqadd v10.4s, v10.4s, v7.4s\n"
+ "and v6.16b, v19.16b, v30.16b\n"
+ "and v3.16b, v20.16b, v9.16b\n"
+ "srshl v13.4s, v13.4s, v30.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v14.4s\n"
+ "add v10.4s, v10.4s, v14.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v15.4s\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "srshl v20.4s, v20.4s, v9.4s\n"
+ "uzp1 v13.16b, v13.16b, v10.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v31.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x15, x6]\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "and v28.16b, v11.16b, v30.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "smin v19.4s, v19.4s, v15.4s\n"
+ "smin v20.4s, v20.4s, v15.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v26.16b, v17.16b, v9.16b\n"
+ "uzp1 v19.16b, v19.16b, v20.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d19, [x14, x6]\n"
+ "srshl v11.4s, v11.4s, v30.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v8.16b, v18.16b, v30.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v9.16b\n"
+ "smin v11.4s, v11.4s, v15.4s\n"
+ "srshl v17.4s, v17.4s, v9.4s\n"
+ "sqadd v18.4s, v18.4s, v8.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "smin v17.4s, v17.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "srshl v21.4s, v21.4s, v9.4s\n"
+ "smin v18.4s, v18.4s, v15.4s\n"
+ "uzp1 v11.16b, v11.16b, v17.16b\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x13, x6]\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v18.16b, v18.16b, v21.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d18, [x12, x6]\n"
+ "add x6, x6, #0x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q13, [x19, #0x0]\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr q10, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "mov v11.16b, v13.16b\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr d0, [x5, #0x0]\n"
+ "ldr d1, [x5, #0x8]\n"
+ "mov v20.16b, v10.16b\n"
+ "ldr d2, [x5, #0x10]\n"
+ "mov v17.16b, v10.16b\n"
+ "ldr d3, [x5, #0x18]\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d4, [x5, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d5, [x5, #0x28]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d6, [x5, #0x30]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ldr d7, [x5, #0x38]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d8, [x5, #0x40]\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ldp x26, x25, [x7, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ldp x24, x23, [x7, #0x10]\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldp x22, x21, [x7, #0x20]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "ldp x20, x19, [x7, #0x30]\n"
+ "ldr d31, [x26, x4]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr d30, [x25, x4]\n"
+ "ldr d29, [x24, x4]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr d28, [x23, x4]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr d27, [x22, x4]\n"
+ "ldr d26, [x21, x4]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr d25, [x20, x4]\n"
+ "ldr d24, [x19, x4]\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v13.4s, v31.4h, v8.4h\n"
+ "ldr x22, [x7, #0x40]\n"
+ "tst x3, #0x7\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "ldr x21, [x7, #0x48]\n"
+ "smlal v19.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x7, #0x50]\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "ldr x19, [x7, #0x58]\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "ldr x11, [x7, #0x60]\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "ldr x10, [x7, #0x68]\n"
+ "smlal v18.4s, v31.4h, v0.4h\n"
+ "ldr x9, [x7, #0x70]\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x7, #0x78]\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x27, [x7, #0x80]\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x26, [x7, #0x88]\n"
+ "smlal v19.4s, v28.4h, v1.4h\n"
+ "ldr x25, [x7, #0x90]\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x21, x4]\n"
+ "smlal v13.4s, v29.4h, v1.4h\n"
+ "ldr x24, [x7, #0x98]\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x22, x4]\n"
+ "smlal v19.4s, v27.4h, v2.4h\n"
+ "ldr x23, [x7, #0xa0]\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x20, x4]\n"
+ "smlal v13.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x7, #0xa8]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x19, x4]\n"
+ "smlal v19.4s, v24.4h, v0.4h\n"
+ "ldr x21, [x7, #0xb0]\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "ldr x20, [x7, #0xb8]\n"
+ "smlal v13.4s, v25.4h, v4.4h\n"
+ "ldr x19, [x7, #0xc0]\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x11, x4]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr q31, [x8, #0x0]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr q30, [x16, #0x0]\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "ldr q23, [x8, #0x10]\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x9, x4]\n"
+ "smlal v19.4s, v29.4h, v4.4h\n"
+ "ldr q9, [x16, #0x10]\n"
+ "add x16, x16, #0x20\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x10, x4]\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal v13.4s, v27.4h, v5.4h\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "smlal v19.4s, v27.4h, v3.4h\n"
+ "smlal v11.4s, v26.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x28, x4]\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x26, x4]\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "smlal v13.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v17.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x25, x4]\n"
+ "smlal v13.4s, v24.4h, v7.4h\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v11.4s, v29.4h, v4.4h\n"
+ "smlal2 v17.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x4]\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v17.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x4]\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x4]\n"
+ "smlal v19.4s, v28.4h, v7.4h\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v18.4s, v28.4h, v1.4h\n"
+ "smlal2 v21.4s, v28.8h, v1.8h\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v26.4h, v5.4h\n"
+ "smlal2 v21.4s, v26.8h, v5.8h\n"
+ "ldr d26, [x21, x4]\n"
+ "smlal v11.4s, v25.4h, v6.4h\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x4]\n"
+ "smlal v19.4s, v29.4h, v8.4h\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v18.4s, v29.4h, v2.4h\n"
+ "smlal2 v21.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x19, x4]\n"
+ "add x4, x4, #0x8\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal2 v17.4s, v27.8h, v7.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v5.4h\n"
+ "smlal2 v17.4s, v24.8h, v5.8h\n"
+ "smlal v18.4s, v26.4h, v7.4h\n"
+ "smlal2 v21.4s, v26.8h, v7.8h\n"
+ "smlal v11.4s, v25.4h, v8.4h\n"
+ "smlal2 v17.4s, v25.8h, v8.8h\n"
+ "smlal v18.4s, v25.4h, v6.4h\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "smlal v18.4s, v29.4h, v8.4h\n"
+ "smlal2 v21.4s, v29.8h, v8.8h\n"
+ "and v27.16b, v13.16b, v30.16b\n"
+ "and v7.16b, v10.16b, v9.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v31.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqadd v13.4s, v13.4s, v27.4s\n"
+ "sqadd v10.4s, v10.4s, v7.4s\n"
+ "and v6.16b, v19.16b, v30.16b\n"
+ "and v3.16b, v20.16b, v9.16b\n"
+ "srshl v13.4s, v13.4s, v30.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v14.4s\n"
+ "add v10.4s, v10.4s, v14.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v15.4s\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "srshl v20.4s, v20.4s, v9.4s\n"
+ "uzp1 v13.16b, v13.16b, v10.16b\n"
+ "sqrdmulh v11.4s, v11.4s, v31.4s\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x15, x6]\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "and v28.16b, v11.16b, v30.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "smin v19.4s, v19.4s, v15.4s\n"
+ "smin v20.4s, v20.4s, v15.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v26.16b, v17.16b, v9.16b\n"
+ "uzp1 v19.16b, v19.16b, v20.16b\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d19, [x14, x6]\n"
+ "srshl v11.4s, v11.4s, v30.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v8.16b, v18.16b, v30.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v9.16b\n"
+ "smin v11.4s, v11.4s, v15.4s\n"
+ "srshl v17.4s, v17.4s, v9.4s\n"
+ "sqadd v18.4s, v18.4s, v8.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "srshl v18.4s, v18.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "smin v17.4s, v17.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "srshl v21.4s, v21.4s, v9.4s\n"
+ "smin v18.4s, v18.4s, v15.4s\n"
+ "uzp1 v11.16b, v11.16b, v17.16b\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d11, [x13, x6]\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v18.16b, v18.16b, v21.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d18, [x12, x6]\n"
+ "add x6, x6, #0x8\n"
+ "beq 88f\n"
+ "add x5, x5, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x3, #2, 5f\n"
+ "ld1 { v13.4s }, [x19], #0x10\n"
+ "tbz x3, #1, 4f\n"
+ "ld1 { v10.d }[0], [x19], #0x8\n"
+ "tbz x3, #0, 7f\n"
+ "ld1 { v10.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x3, #0, 7f\n"
+ "ld1 { v10.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x3, #1, 6f\n"
+ "ld1 { v13.d }[0], [x19], #0x8\n"
+ "tbz x3, #0, 7f\n"
+ "ld1 { v13.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 7f\n"
+ "ld1 { v13.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v19.16b, v13.16b\n"
+ "ldr d0, [x5, #0x0]\n"
+ "mov v20.16b, v10.16b\n"
+ "ldr d1, [x5, #0x8]\n"
+ "mov v11.16b, v13.16b\n"
+ "ldr d2, [x5, #0x10]\n"
+ "mov v17.16b, v10.16b\n"
+ "ldr d3, [x5, #0x18]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr d4, [x5, #0x20]\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d5, [x5, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d6, [x5, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d7, [x5, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ldr d8, [x5, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldp x26, x25, [x7, #0x0]\n"
+ "add x26, x26, x4\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ldp x24, x23, [x7, #0x10]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ldp x22, x21, [x7, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "add x25, x25, x4\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldp x20, x19, [x7, #0x30]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "add x24, x24, x4\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "add x19, x19, x4\n"
+ "tbz x3, #2, 9f\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v30.s }[0], [x25], #0x4\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x3, #1, 8f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x3, #0, 11f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v30.b }[6], [x25]\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x3, #0, 11f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v30.b }[4], [x25]\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x3, #1, 10f\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v30.h }[0], [x25], #0x2\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x3, #0, 11f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v30.b }[2], [x25]\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 11f\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v30.b }[0], [x25]\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x22, [x7, #0x40]\n"
+ "add x22, x22, x4\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "smlal v13.4s, v31.4h, v8.4h\n"
+ "smlal2 v10.4s, v31.8h, v8.8h\n"
+ "smlal v19.4s, v31.4h, v6.4h\n"
+ "smlal2 v20.4s, v31.8h, v6.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "smlal v18.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v19.4s, v28.4h, v1.4h\n"
+ "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal v13.4s, v29.4h, v1.4h\n"
+ "smlal2 v10.4s, v29.8h, v1.8h\n"
+ "smlal v19.4s, v27.4h, v2.4h\n"
+ "smlal2 v20.4s, v27.8h, v2.8h\n"
+ "smlal v13.4s, v26.4h, v3.4h\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal v19.4s, v24.4h, v0.4h\n"
+ "smlal2 v20.4s, v24.8h, v0.8h\n"
+ "smlal v13.4s, v25.4h, v4.4h\n"
+ "smlal2 v10.4s, v25.8h, v4.8h\n"
+ "smlal v13.4s, v24.4h, v2.4h\n"
+ "smlal2 v10.4s, v24.8h, v2.8h\n"
+ "tbz x3, #2, 13f\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "tbz x3, #1, 12f\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "tbz x3, #0, 15f\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 15f\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x3, #1, 14f\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "tbz x3, #0, 15f\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 15f\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x21, [x7, #0x48]\n"
+ "smlal v19.4s, v29.4h, v4.4h\n"
+ "add x21, x21, x4\n"
+ "smlal2 v20.4s, v29.8h, v4.8h\n"
+ "tbz x3, #2, 17f\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "tbz x3, #1, 16f\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x3, #0, 19f\n"
+ "ld1 { v28.b }[6], [x21]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 19f\n"
+ "ld1 { v28.b }[4], [x21]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x3, #1, 18f\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x3, #0, 19f\n"
+ "ld1 { v28.b }[2], [x21]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 19f\n"
+ "ld1 { v28.b }[0], [x21]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x20, [x7, #0x50]\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "add x20, x20, x4\n"
+ "smlal2 v20.4s, v28.8h, v5.8h\n"
+ "tbz x3, #2, 21f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 20f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 23f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 23f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x3, #1, 22f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 23f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 23f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "ldr x19, [x7, #0x58]\n"
+ "smlal v13.4s, v27.4h, v5.4h\n"
+ "add x19, x19, x4\n"
+ "smlal2 v10.4s, v27.8h, v5.8h\n"
+ "smlal v19.4s, v27.4h, v3.4h\n"
+ "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "tbz x3, #2, 25f\n"
+ "ld1 { v26.s }[0], [x19], #0x4\n"
+ "tbz x3, #1, 24f\n"
+ "ld1 { v26.h }[2], [x19], #0x2\n"
+ "tbz x3, #0, 27f\n"
+ "ld1 { v26.b }[6], [x19]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 27f\n"
+ "ld1 { v26.b }[4], [x19]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x3, #1, 26f\n"
+ "ld1 { v26.h }[0], [x19], #0x2\n"
+ "tbz x3, #0, 27f\n"
+ "ld1 { v26.b }[2], [x19]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 27f\n"
+ "ld1 { v26.b }[0], [x19]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "ldr x11, [x7, #0x60]\n"
+ "smlal v11.4s, v26.4h, v3.4h\n"
+ "add x11, x11, x4\n"
+ "smlal2 v17.4s, v26.8h, v3.8h\n"
+ "tbz x3, #2, 29f\n"
+ "ld1 { v25.s }[0], [x11], #0x4\n"
+ "tbz x3, #1, 28f\n"
+ "ld1 { v25.h }[2], [x11], #0x2\n"
+ "tbz x3, #0, 31f\n"
+ "ld1 { v25.b }[6], [x11]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 31f\n"
+ "ld1 { v25.b }[4], [x11]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x3, #1, 30f\n"
+ "ld1 { v25.h }[0], [x11], #0x2\n"
+ "tbz x3, #0, 31f\n"
+ "ld1 { v25.b }[2], [x11]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 31f\n"
+ "ld1 { v25.b }[0], [x11]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "ldr x10, [x7, #0x68]\n"
+ "smlal v13.4s, v25.4h, v6.4h\n"
+ "add x10, x10, x4\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v17.4s, v25.8h, v0.8h\n"
+ "tbz x3, #2, 33f\n"
+ "ld1 { v29.s }[0], [x10], #0x4\n"
+ "tbz x3, #1, 32f\n"
+ "ld1 { v29.h }[2], [x10], #0x2\n"
+ "tbz x3, #0, 35f\n"
+ "ld1 { v29.b }[6], [x10]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 35f\n"
+ "ld1 { v29.b }[4], [x10]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x3, #1, 34f\n"
+ "ld1 { v29.h }[0], [x10], #0x2\n"
+ "tbz x3, #0, 35f\n"
+ "ld1 { v29.b }[2], [x10]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 35f\n"
+ "ld1 { v29.b }[0], [x10]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x9, [x7, #0x70]\n"
+ "smlal v11.4s, v29.4h, v4.4h\n"
+ "add x9, x9, x4\n"
+ "smlal2 v17.4s, v29.8h, v4.8h\n"
+ "tbz x3, #2, 37f\n"
+ "ld1 { v24.s }[0], [x9], #0x4\n"
+ "tbz x3, #1, 36f\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "tbz x3, #0, 39f\n"
+ "ld1 { v24.b }[6], [x9]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 39f\n"
+ "ld1 { v24.b }[4], [x9]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x3, #1, 38f\n"
+ "ld1 { v24.h }[0], [x9], #0x2\n"
+ "tbz x3, #0, 39f\n"
+ "ld1 { v24.b }[2], [x9]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 39f\n"
+ "ld1 { v24.b }[0], [x9]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "ldr x28, [x7, #0x78]\n"
+ "smlal v13.4s, v24.4h, v7.4h\n"
+ "add x28, x28, x4\n"
+ "smlal2 v10.4s, v24.8h, v7.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v17.4s, v24.8h, v1.8h\n"
+ "tbz x3, #2, 41f\n"
+ "ld1 { v27.s }[0], [x28], #0x4\n"
+ "tbz x3, #1, 40f\n"
+ "ld1 { v27.h }[2], [x28], #0x2\n"
+ "tbz x3, #0, 43f\n"
+ "ld1 { v27.b }[6], [x28]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 43f\n"
+ "ld1 { v27.b }[4], [x28]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x3, #1, 42f\n"
+ "ld1 { v27.h }[0], [x28], #0x2\n"
+ "tbz x3, #0, 43f\n"
+ "ld1 { v27.b }[2], [x28]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 43f\n"
+ "ld1 { v27.b }[0], [x28]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "ldr x27, [x7, #0x80]\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "add x27, x27, x4\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "tbz x3, #2, 45f\n"
+ "ld1 { v28.s }[0], [x27], #0x4\n"
+ "tbz x3, #1, 44f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "tbz x3, #0, 47f\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 47f\n"
+ "ld1 { v28.b }[4], [x27]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x3, #1, 46f\n"
+ "ld1 { v28.h }[0], [x27], #0x2\n"
+ "tbz x3, #0, 47f\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 47f\n"
+ "ld1 { v28.b }[0], [x27]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x26, [x7, #0x88]\n"
+ "smlal v19.4s, v28.4h, v7.4h\n"
+ "add x26, x26, x4\n"
+ "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "smlal v18.4s, v28.4h, v1.4h\n"
+ "smlal2 v21.4s, v28.8h, v1.8h\n"
+ "tbz x3, #2, 49f\n"
+ "ld1 { v26.s }[0], [x26], #0x4\n"
+ "tbz x3, #1, 48f\n"
+ "ld1 { v26.h }[2], [x26], #0x2\n"
+ "tbz x3, #0, 51f\n"
+ "ld1 { v26.b }[6], [x26]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 51f\n"
+ "ld1 { v26.b }[4], [x26]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x3, #1, 50f\n"
+ "ld1 { v26.h }[0], [x26], #0x2\n"
+ "tbz x3, #0, 51f\n"
+ "ld1 { v26.b }[2], [x26]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 51f\n"
+ "ld1 { v26.b }[0], [x26]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "ldr x25, [x7, #0x90]\n"
+ "smlal v18.4s, v26.4h, v5.4h\n"
+ "add x25, x25, x4\n"
+ "smlal2 v21.4s, v26.8h, v5.8h\n"
+ "tbz x3, #2, 53f\n"
+ "ld1 { v25.s }[0], [x25], #0x4\n"
+ "tbz x3, #1, 52f\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "tbz x3, #0, 55f\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 55f\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x3, #1, 54f\n"
+ "ld1 { v25.h }[0], [x25], #0x2\n"
+ "tbz x3, #0, 55f\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 55f\n"
+ "ld1 { v25.b }[0], [x25]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "ldr x24, [x7, #0x98]\n"
+ "smlal v11.4s, v25.4h, v6.4h\n"
+ "add x24, x24, x4\n"
+ "smlal2 v17.4s, v25.8h, v6.8h\n"
+ "tbz x3, #2, 57f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x3, #1, 56f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x3, #0, 59f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 59f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x3, #1, 58f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x3, #0, 59f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 59f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x23, [x7, #0xa0]\n"
+ "smlal v19.4s, v29.4h, v8.4h\n"
+ "add x23, x23, x4\n"
+ "smlal2 v20.4s, v29.8h, v8.8h\n"
+ "smlal v18.4s, v29.4h, v2.4h\n"
+ "smlal2 v21.4s, v29.8h, v2.8h\n"
+ "tbz x3, #2, 61f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x3, #1, 60f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x3, #0, 63f\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 63f\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x3, #1, 62f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x3, #0, 63f\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 63f\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "ldr x22, [x7, #0xa8]\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "add x22, x22, x4\n"
+ "smlal2 v17.4s, v27.8h, v7.8h\n"
+ "tbz x3, #2, 65f\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "tbz x3, #1, 64f\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x3, #0, 67f\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 67f\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x3, #1, 66f\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "tbz x3, #0, 67f\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 67f\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v24.8h, v24.8b, v22.8b\n"
+ "ldr x21, [x7, #0xb0]\n"
+ "smlal v11.4s, v24.4h, v5.4h\n"
+ "add x21, x21, x4\n"
+ "smlal2 v17.4s, v24.8h, v5.8h\n"
+ "smlal v18.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "tbz x3, #2, 69f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x3, #1, 68f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x3, #0, 71f\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 71f\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x3, #1, 70f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x3, #0, 71f\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 71f\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v26.8h, v26.8b, v22.8b\n"
+ "ldr x20, [x7, #0xb8]\n"
+ "smlal v18.4s, v26.4h, v7.4h\n"
+ "add x20, x20, x4\n"
+ "smlal2 v21.4s, v26.8h, v7.8h\n"
+ "tbz x3, #2, 73f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 72f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 75f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 75f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x3, #1, 74f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 75f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 75f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v25.8h, v25.8b, v22.8b\n"
+ "ldr x19, [x7, #0xc0]\n"
+ "smlal v11.4s, v25.4h, v8.4h\n"
+ "add x19, x19, x4\n"
+ "smlal2 v17.4s, v25.8h, v8.8h\n"
+ "smlal v18.4s, v25.4h, v6.4h\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "tbz x3, #2, 77f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x3, #1, 76f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x3, #0, 79f\n"
+ "ld1 { v29.b }[6], [x19]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x3, #0, 79f\n"
+ "ld1 { v29.b }[4], [x19]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x3, #1, 78f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x3, #0, 79f\n"
+ "ld1 { v29.b }[2], [x19]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 79f\n"
+ "ld1 { v29.b }[0], [x19]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v8.4h\n"
+ "smlal2 v21.4s, v29.8h, v8.8h\n"
+ "tbz x3, #2, 81f\n"
+ "ld1 { v31.4s }, [x8], #0x10\n"
+ "ld1 { v30.4s }, [x16], #0x10\n"
+ "tbz x3, #1, 80f\n"
+ "ld1 { v23.d }[0], [x8], #0x8\n"
+ "ld1 { v9.d }[0], [x16], #0x8\n"
+ "tbz x3, #0, 83f\n"
+ "ld1 { v23.s }[2], [x8]\n"
+ "ld1 { v9.s }[2], [x16]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x3, #0, 83f\n"
+ "ld1 { v23.s }[0], [x8]\n"
+ "ld1 { v9.s }[0], [x16]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x3, #1, 82f\n"
+ "ld1 { v31.d }[0], [x8], #0x8\n"
+ "ld1 { v30.d }[0], [x16], #0x8\n"
+ "tbz x3, #0, 83f\n"
+ "ld1 { v31.s }[2], [x8]\n"
+ "ld1 { v30.s }[2], [x16]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 83f\n"
+ "ld1 { v31.s }[0], [x8]\n"
+ "ld1 { v30.s }[0], [x16]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "add x15, x15, x6\n"
+ "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+ "add x14, x14, x6\n"
+ "sqrdmulh v19.4s, v19.4s, v31.4s\n"
+ "add x13, x13, x6\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "add x12, x12, x6\n"
+ "sqrdmulh v11.4s, v11.4s, v31.4s\n"
+ "and v27.16b, v13.16b, v30.16b\n"
+ "and v7.16b, v10.16b, v9.16b\n"
+ "and v6.16b, v19.16b, v30.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v27.4s\n"
+ "sqadd v10.4s, v10.4s, v7.4s\n"
+ "sqadd v19.4s, v19.4s, v6.4s\n"
+ "and v3.16b, v20.16b, v9.16b\n"
+ "srshl v13.4s, v13.4s, v30.4s\n"
+ "srshl v10.4s, v10.4s, v9.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "add v13.4s, v13.4s, v14.4s\n"
+ "add v10.4s, v10.4s, v14.4s\n"
+ "add v19.4s, v19.4s, v14.4s\n"
+ "smin v13.4s, v13.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v15.4s\n"
+ "smin v19.4s, v19.4s, v15.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "uzp1 v13.16b, v13.16b, v10.16b\n"
+ "and v28.16b, v11.16b, v30.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "srshl v20.4s, v20.4s, v9.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "sqadd v11.4s, v11.4s, v28.4s\n"
+ "and v26.16b, v17.16b, v9.16b\n"
+ "smin v20.4s, v20.4s, v15.4s\n"
+ "and v8.16b, v18.16b, v30.16b\n"
+ "srshl v11.4s, v11.4s, v30.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "uzp1 v19.16b, v19.16b, v20.16b\n"
+ "add v11.4s, v11.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "sqadd v17.4s, v17.4s, v26.4s\n"
+ "smin v11.4s, v11.4s, v15.4s\n"
+ "sqadd v18.4s, v18.4s, v8.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "srshl v17.4s, v17.4s, v9.4s\n"
+ "srshl v18.4s, v18.4s, v30.4s\n"
+ "and v27.16b, v21.16b, v9.16b\n"
+ "add v17.4s, v17.4s, v14.4s\n"
+ "add v18.4s, v18.4s, v14.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smin v17.4s, v17.4s, v15.4s\n"
+ "smin v18.4s, v18.4s, v15.4s\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "srshl v21.4s, v21.4s, v9.4s\n"
+ "uzp1 v11.16b, v11.16b, v17.16b\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v18.16b, v18.16b, v21.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "tbz x3, #2, 85f\n"
+ "st1 { v13.s }[0], [x15], #0x4\n"
+ "st1 { v19.s }[0], [x14], #0x4\n"
+ "st1 { v11.s }[0], [x13], #0x4\n"
+ "st1 { v18.s }[0], [x12], #0x4\n"
+ "tbz x3, #1, 84f\n"
+ "st1 { v13.h }[2], [x15], #0x2\n"
+ "st1 { v19.h }[2], [x14], #0x2\n"
+ "st1 { v11.h }[2], [x13], #0x2\n"
+ "st1 { v18.h }[2], [x12], #0x2\n"
+ "tbz x3, #0, 87f\n"
+ "st1 { v13.b }[6], [x15], #0x1\n"
+ "st1 { v19.b }[6], [x14], #0x1\n"
+ "st1 { v11.b }[6], [x13], #0x1\n"
+ "st1 { v18.b }[6], [x12], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x3, #0, 87f\n"
+ "st1 { v13.b }[4], [x15], #0x1\n"
+ "st1 { v19.b }[4], [x14], #0x1\n"
+ "st1 { v11.b }[4], [x13], #0x1\n"
+ "st1 { v18.b }[4], [x12], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x3, #1, 86f\n"
+ "st1 { v13.h }[0], [x15], #0x2\n"
+ "st1 { v19.h }[0], [x14], #0x2\n"
+ "st1 { v11.h }[0], [x13], #0x2\n"
+ "st1 { v18.h }[0], [x12], #0x2\n"
+ "tbz x3, #0, 87f\n"
+ "st1 { v13.b }[2], [x15], #0x1\n"
+ "st1 { v19.b }[2], [x14], #0x1\n"
+ "st1 { v11.b }[2], [x13], #0x1\n"
+ "st1 { v18.b }[2], [x12], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x3, #0, 87f\n"
+ "st1 { v13.b }[0], [x15], #0x1\n"
+ "st1 { v19.b }[0], [x14], #0x1\n"
+ "st1 { v11.b }[0], [x13], #0x1\n"
+ "st1 { v18.b }[0], [x12], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+
+ "88:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
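
The epilogue above, repeated in each u8s8u8q kernel in this patch, is the
standard fixed-point requantisation: SQRDMULH by a per-channel Q31
multiplier, an AND/SSHR/SQADD sign fix-up, a rounding right shift via SRSHL,
re-centring on c_offset, clamping to [minval, maxval], and UZP1 passes that
narrow the 32-bit lanes to bytes. Below is a scalar sketch of the same
arithmetic, assuming the per-channel shifts are stored as the negative
amounts SRSHL consumes directly; the function names are illustrative, not
taken from the library.

#include <algorithm>
#include <cstdint>

// SQRDMULH: saturating rounding doubling multiply returning the high half.
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the only saturating case
    const int64_t prod = static_cast<int64_t>(a) * b;
    return static_cast<int32_t>((2 * prod + (INT64_C(1) << 31)) >> 32);
}

// SRSHL by -exponent, preceded by the AND/SSHR/SQADD fix-up that turns its
// round-half-up behaviour into round-to-nearest, ties away from zero.
static int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    if (exponent <= 0) return x;
    int64_t v = x;
    if (v < 0) v -= 1;  // what the SQADD of the extracted sign mask achieves
    return static_cast<int32_t>((v + (INT64_C(1) << (exponent - 1))) >> exponent);
}

// One output value: multiply, shift, re-centre, clamp, narrow.
static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_doubling_high_mul(acc, mul);  // SQRDMULH
    v = rounding_divide_by_pot(v, -shift);             // SRSHL plus fix-up
    v += c_offset;                                     // ADD of c_offset
    v = std::min(std::max(v, minval), maxval);         // SMIN/SMAX
    return static_cast<uint8_t>(v);                    // UZP1 narrowing
}

The fix-up is needed because SRSHL alone rounds ties towards positive
infinity; subtracting one from negative inputs beforehand makes ties round
away from zero, matching the reference requantisation.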
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..d3d5000d4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+
+ kern_type kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+ a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
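
A consistency note on the descriptor above: input_rows and input_cols are
not free parameters but follow from the output tile, the stride and the
kernel size; the 6x6 window is exactly what a 2x2 output of a 5x5 stride-1
kernel touches. A minimal sketch, with names that are ours rather than the
library's:

// Input extent needed along one dimension by a depthfirst output tile.
constexpr unsigned int input_extent(unsigned int output, unsigned int stride,
                                    unsigned int kernel)
{
    return (output - 1) * stride + kernel;
}
static_assert(input_extent(2, 1, 5) == 6,
              "a 2x2 output of a 5x5 s1 kernel reads a 6x6 input window");

Note also that this uint8-input, int8-weight variant points pack_parameters
and get_packed_size at the signed interleave (interleave_a64_s8q_5x5_mla),
presumably because only the weight tensor is signed here.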
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..97156137bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
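+      // The raw array is row-major over the 6x6 input window (index is
+      // row * 6 + col); the copies below permute the leading entries into
+      // the order the assembly reads them back, so that pointer fetches in
+      // the kernel are plain sequential LDP loads from this block.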
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+    }
+ };
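+  // A plain-struct parameter block is used so that the inline assembly can
+  // address every field through compile-time offsetof() constants from a
+  // single base register (%x[params]).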
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
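+  // Channels are handled eight at a time: the "Loop" and unrolled "Tail"
+  // sections together cover the n_channels >> 3 full blocks, and "Oddments"
+  // mops up the remaining n_channels & 7 lanes with tbz bit tests on the
+  // channel count.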
+ __asm__ __volatile__(
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "mov x10, #0x0\n"
+ "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x1, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "add x25, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "lsr x19, x4, #0x3\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v9.16b }, [x13]\n"
+ "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v10.4s }, [x8]\n"
+ "add x8, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v13.4s }, [x8]\n"
+ "ldp x17, x16, [x21, #0x0]\n"
+ "ldp x6, x8, [x21, #0x10]\n"
+ "cbz x19, 3f\n"
+ "subs x19, x19, #0x1\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x12, #0x0]\n"
+ "mov v16.16b, v15.16b\n"
+ "ldr q18, [x12, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ "mov v7.16b, v15.16b\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v21.16b, v18.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "mov v17.16b, v18.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v5.16b, v18.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "ldr d31, [x28, x10]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr d30, [x27, x10]\n"
+ "ldr d29, [x26, x10]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d28, [x13, x10]\n"
+ "usubl v29.8h, v29.8b, v9.8b\n"
+ "ldr d27, [x24, x10]\n"
+ "ldr d23, [x23, x10]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr d25, [x22, x10]\n"
+ "ldr d24, [x21, x10]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldr d26, [x20, x10]\n"
+ "ldr d22, [x0, x10]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x25, #0x50]\n"
+ "subs x19, x19, #0x1\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v16.4s, v30.4h, v0.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x10]\n"
+ "smlal v7.4s, v29.4h, v0.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal2 v17.4s, v29.8h, v0.8h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "smlal v8.4s, v28.4h, v0.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "smlal2 v18.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "smlal v16.4s, v27.4h, v1.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "smlal v7.4s, v28.4h, v1.4h\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "smlal2 v18.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x0, x10]\n"
+ "smlal v16.4s, v25.4h, v2.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "smlal v7.4s, v23.4h, v2.4h\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal2 v17.4s, v23.8h, v2.8h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr q6, [x2, #0x0]\n"
+ "smlal2 v18.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x7, x10]\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "smlal v16.4s, v24.4h, v3.4h\n"
+ "ldr q19, [x5, #0x0]\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr q20, [x2, #0x10]\n"
+ "add x2, x2, #0x20\n"
+ "smlal v7.4s, v31.4h, v3.4h\n"
+ "ldr q12, [x5, #0x10]\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v17.4s, v31.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v18.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x26, x10]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v7.4s, v30.4h, v4.4h\n"
+ "smlal2 v17.4s, v30.8h, v4.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x10]\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v18.4s, v29.8h, v0.8h\n"
+ "smlal v16.4s, v28.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "smlal v7.4s, v22.4h, v0.4h\n"
+ "smlal2 v17.4s, v22.8h, v0.8h\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x10]\n"
+ "smlal v16.4s, v23.4h, v1.4h\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v7.4s, v25.4h, v1.4h\n"
+ "smlal2 v17.4s, v25.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v18.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x20, x10]\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v7.4s, v24.4h, v2.4h\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr d31, [x13, x10]\n"
+ "smlal v16.4s, v30.4h, v3.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v7.4s, v27.4h, v3.4h\n"
+ "smlal2 v17.4s, v27.8h, v3.8h\n"
+ "smlal v8.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v18.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x10]\n"
+ "smlal v16.4s, v26.4h, v4.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x10]\n"
+ "smlal v7.4s, v23.4h, v4.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v18.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x0, x10]\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal v7.4s, v31.4h, v0.4h\n"
+ "smlal2 v17.4s, v31.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v18.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x11, x10]\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "smlal v7.4s, v30.4h, v1.4h\n"
+ "smlal2 v17.4s, v30.8h, v1.8h\n"
+ "smlal v8.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v18.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x24, x10]\n"
+ "smlal v16.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "smlal v7.4s, v26.4h, v2.4h\n"
+ "smlal2 v17.4s, v26.8h, v2.8h\n"
+ "smlal v8.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v18.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x15, x10]\n"
+ "smlal v16.4s, v23.4h, v3.4h\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v18.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x10]\n"
+ "smlal v16.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x12, x10]\n"
+ "smlal v7.4s, v24.4h, v4.4h\n"
+ "smlal2 v17.4s, v24.8h, v4.8h\n"
+ "smlal v8.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x27, x10]\n"
+ "smlal v16.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "smlal v7.4s, v27.4h, v0.4h\n"
+ "smlal2 v17.4s, v27.8h, v0.8h\n"
+ "smlal v8.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v18.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "smlal v16.4s, v26.4h, v1.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "smlal v7.4s, v23.4h, v1.4h\n"
+ "smlal2 v17.4s, v23.8h, v1.8h\n"
+ "smlal v8.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v18.4s, v26.8h, v2.8h\n"
+ "ldr d26, [x7, x10]\n"
+ "smlal v16.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v18.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x26, x10]\n"
+ "smlal v16.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "smlal v8.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v18.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x10]\n"
+ "smlal v16.4s, v22.4h, v4.4h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v17.4s, v28.8h, v4.8h\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "add x3, x3, #0xc8\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v18.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x10]\n"
+ "smlal v16.4s, v23.4h, v0.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v7.4s, v25.4h, v0.4h\n"
+ "smlal2 v17.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x20, x10]\n"
+ "smlal v8.4s, v24.4h, v0.4h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v18.4s, v23.8h, v1.8h\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "smlal v7.4s, v24.4h, v1.4h\n"
+ "smlal2 v17.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x13, x10]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v18.4s, v31.8h, v2.8h\n"
+ "smlal v16.4s, v30.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "smlal v7.4s, v27.4h, v2.4h\n"
+ "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x10]\n"
+ "add x10, x10, #0x8\n"
+ "smlal v8.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v18.4s, v30.8h, v3.8h\n"
+ "smlal v16.4s, v28.4h, v3.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "smlal v7.4s, v24.4h, v4.4h\n"
+ "smlal2 v17.4s, v24.8h, v4.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "and v28.16b, v15.16b, v19.16b\n"
+ "and v26.16b, v18.16b, v12.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "and v29.16b, v16.16b, v19.16b\n"
+ "and v4.16b, v21.16b, v12.16b\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v12.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v10.4s\n"
+ "add v18.4s, v18.4s, v10.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "smin v15.4s, v15.4s, v13.4s\n"
+ "smin v18.4s, v18.4s, v13.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "smax v15.4s, v15.4s, v11.4s\n"
+ "smax v18.4s, v18.4s, v11.4s\n"
+ "srshl v16.4s, v16.4s, v19.4s\n"
+ "srshl v21.4s, v21.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v6.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x17, x1]\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "and v25.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smax v16.4s, v16.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v11.4s\n"
+ "sqadd v7.4s, v7.4s, v25.4s\n"
+ "and v31.16b, v17.16b, v12.16b\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v6.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x16, x1]\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v24.16b, v8.16b, v19.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "add v7.4s, v7.4s, v10.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v1.16b, v5.16b, v12.16b\n"
+ "smin v7.4s, v7.4s, v13.4s\n"
+ "srshl v17.4s, v17.4s, v12.4s\n"
+ "sqadd v8.4s, v8.4s, v24.4s\n"
+ "smax v7.4s, v7.4s, v11.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "add v17.4s, v17.4s, v10.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v1.4s\n"
+ "smin v17.4s, v17.4s, v13.4s\n"
+ "add v8.4s, v8.4s, v10.4s\n"
+ "smax v17.4s, v17.4s, v11.4s\n"
+ "srshl v5.4s, v5.4s, v12.4s\n"
+ "smin v8.4s, v8.4s, v13.4s\n"
+ "uzp1 v7.16b, v7.16b, v17.16b\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x6, x1]\n"
+ "smax v8.4s, v8.4s, v11.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "smax v5.4s, v5.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v5.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x8, x1]\n"
+ "add x1, x1, #0x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x12, #0x0]\n"
+ "mov v16.16b, v15.16b\n"
+ "ldr q18, [x12, #0x10]\n"
+ "add x12, x12, #0x20\n"
+ "mov v7.16b, v15.16b\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v21.16b, v18.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "mov v17.16b, v18.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v5.16b, v18.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "ldr d31, [x28, x10]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr d30, [x27, x10]\n"
+ "ldr d29, [x26, x10]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d28, [x13, x10]\n"
+ "usubl v29.8h, v29.8b, v9.8b\n"
+ "ldr d27, [x24, x10]\n"
+ "ldr d23, [x23, x10]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr d25, [x22, x10]\n"
+ "ldr d24, [x21, x10]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldr d26, [x20, x10]\n"
+ "ldr d22, [x0, x10]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x20, [x25, #0x50]\n"
+ "tst x4, #0x7\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v16.4s, v30.4h, v0.4h\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "ldr d31, [x20, x10]\n"
+ "smlal v7.4s, v29.4h, v0.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "smlal2 v17.4s, v29.8h, v0.8h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "smlal v8.4s, v28.4h, v0.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d0, [x3, #0x28]\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "smlal2 v18.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "smlal v16.4s, v27.4h, v1.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "smlal v7.4s, v28.4h, v1.4h\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d1, [x3, #0x30]\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "smlal2 v18.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x0, x10]\n"
+ "smlal v16.4s, v25.4h, v2.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "smlal v7.4s, v23.4h, v2.4h\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal2 v17.4s, v23.8h, v2.8h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "smlal2 v18.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x7, x10]\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d2, [x3, #0x38]\n"
+ "smlal v16.4s, v24.4h, v3.4h\n"
+ "ldr q6, [x2, #0x0]\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "ldr q19, [x5, #0x0]\n"
+ "smlal v7.4s, v31.4h, v3.4h\n"
+ "ldr q20, [x2, #0x10]\n"
+ "add x2, x2, #0x20\n"
+ "smlal2 v17.4s, v31.8h, v3.8h\n"
+ "ldr q12, [x5, #0x10]\n"
+ "add x5, x5, #0x20\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v18.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x26, x10]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x23, x10]\n"
+ "smlal v7.4s, v30.4h, v4.4h\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal2 v17.4s, v30.8h, v4.8h\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0x48]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v18.4s, v29.8h, v0.8h\n"
+ "smlal v16.4s, v28.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "smlal v7.4s, v22.4h, v0.4h\n"
+ "smlal2 v17.4s, v22.8h, v0.8h\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x22, x10]\n"
+ "smlal v16.4s, v23.4h, v1.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v7.4s, v25.4h, v1.4h\n"
+ "smlal2 v17.4s, v25.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v18.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x20, x10]\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v7.4s, v24.4h, v2.4h\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d2, [x3, #0x60]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "ldr d31, [x13, x10]\n"
+ "smlal v16.4s, v30.4h, v3.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v7.4s, v27.4h, v3.4h\n"
+ "smlal2 v17.4s, v27.8h, v3.8h\n"
+ "smlal v8.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v18.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x10]\n"
+ "smlal v16.4s, v26.4h, v4.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x14, x10]\n"
+ "smlal v7.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v18.4s, v22.8h, v0.8h\n"
+ "ldr d22, [x0, x10]\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal v7.4s, v31.4h, v0.4h\n"
+ "smlal2 v17.4s, v31.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v18.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x11, x10]\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "smlal v7.4s, v30.4h, v1.4h\n"
+ "smlal2 v17.4s, v30.8h, v1.8h\n"
+ "smlal v8.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ldr d1, [x3, #0x80]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v18.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x24, x10]\n"
+ "smlal v16.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "smlal v7.4s, v26.4h, v2.4h\n"
+ "smlal2 v17.4s, v26.8h, v2.8h\n"
+ "smlal v8.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v18.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x15, x10]\n"
+ "smlal v16.4s, v23.4h, v3.4h\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v18.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x10]\n"
+ "smlal v16.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x12, x10]\n"
+ "smlal v7.4s, v24.4h, v4.4h\n"
+ "smlal2 v17.4s, v24.8h, v4.8h\n"
+ "smlal v8.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ldr d4, [x3, #0x98]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x27, x10]\n"
+ "smlal v16.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "smlal v7.4s, v27.4h, v0.4h\n"
+ "smlal2 v17.4s, v27.8h, v0.8h\n"
+ "smlal v8.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v18.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x28, x10]\n"
+ "smlal v16.4s, v26.4h, v1.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "smlal v7.4s, v23.4h, v1.4h\n"
+ "smlal2 v17.4s, v23.8h, v1.8h\n"
+ "smlal v8.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v18.4s, v26.8h, v2.8h\n"
+ "ldr d26, [x7, x10]\n"
+ "smlal v16.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal v8.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v18.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x26, x10]\n"
+ "smlal v16.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "smlal v8.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v18.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x10]\n"
+ "smlal v16.4s, v22.4h, v4.4h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v17.4s, v28.8h, v4.8h\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v18.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x10]\n"
+ "smlal v16.4s, v23.4h, v0.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v7.4s, v25.4h, v0.4h\n"
+ "smlal2 v17.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x20, x10]\n"
+ "smlal v8.4s, v24.4h, v0.4h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v18.4s, v23.8h, v1.8h\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "smlal v7.4s, v24.4h, v1.4h\n"
+ "smlal2 v17.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x13, x10]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v18.4s, v31.8h, v2.8h\n"
+ "smlal v16.4s, v30.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "smlal v7.4s, v27.4h, v2.4h\n"
+ "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x10]\n"
+ "add x10, x10, #0x8\n"
+ "smlal v8.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v18.4s, v30.8h, v3.8h\n"
+ "smlal v16.4s, v28.4h, v3.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "smlal v7.4s, v24.4h, v4.4h\n"
+ "smlal2 v17.4s, v24.8h, v4.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "and v28.16b, v15.16b, v19.16b\n"
+ "and v26.16b, v18.16b, v12.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "and v29.16b, v16.16b, v19.16b\n"
+ "and v4.16b, v21.16b, v12.16b\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v12.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v10.4s\n"
+ "add v18.4s, v18.4s, v10.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "smin v15.4s, v15.4s, v13.4s\n"
+ "smin v18.4s, v18.4s, v13.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "smax v15.4s, v15.4s, v11.4s\n"
+ "smax v18.4s, v18.4s, v11.4s\n"
+ "srshl v16.4s, v16.4s, v19.4s\n"
+ "srshl v21.4s, v21.4s, v12.4s\n"
+ "uzp1 v15.16b, v15.16b, v18.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v6.4s\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x17, x1]\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "and v25.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "smax v16.4s, v16.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v11.4s\n"
+ "sqadd v7.4s, v7.4s, v25.4s\n"
+ "and v31.16b, v17.16b, v12.16b\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v6.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d16, [x16, x1]\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v24.16b, v8.16b, v19.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "add v7.4s, v7.4s, v10.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v1.16b, v5.16b, v12.16b\n"
+ "smin v7.4s, v7.4s, v13.4s\n"
+ "srshl v17.4s, v17.4s, v12.4s\n"
+ "sqadd v8.4s, v8.4s, v24.4s\n"
+ "smax v7.4s, v7.4s, v11.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "add v17.4s, v17.4s, v10.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v1.4s\n"
+ "smin v17.4s, v17.4s, v13.4s\n"
+ "add v8.4s, v8.4s, v10.4s\n"
+ "smax v17.4s, v17.4s, v11.4s\n"
+ "srshl v5.4s, v5.4s, v12.4s\n"
+ "smin v8.4s, v8.4s, v13.4s\n"
+ "uzp1 v7.16b, v7.16b, v17.16b\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x6, x1]\n"
+ "smax v8.4s, v8.4s, v11.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "smax v5.4s, v5.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v5.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x8, x1]\n"
+ "add x1, x1, #0x8\n"
+ "beq 124f\n"
+ "add x3, x3, #0xc8\n"
+ "3:" // Oddments
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
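+    // Load the leftover bias words with the same bit-test scheme used for
+    // every partial access in this section: bit 2 of n_channels covers four
+    // words, bit 1 two more and bit 0 a final one.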
+ "tbz x4, #2, 5f\n"
+ "ld1 { v15.4s }, [x12], #0x10\n"
+ "tbz x4, #1, 4f\n"
+ "ld1 { v18.d }[0], [x12], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v18.s }[2], [x12]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v18.s }[0], [x12]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x4, #1, 6f\n"
+ "ld1 { v15.d }[0], [x12], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[2], [x12]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v15.s }[0], [x12]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "mov v16.16b, v15.16b\n"
+ "ldr d0, [x3, #0x0]\n"
+ "mov v21.16b, v18.16b\n"
+ "ldr d1, [x3, #0x8]\n"
+ "mov v7.16b, v15.16b\n"
+ "ldr d2, [x3, #0x10]\n"
+ "mov v17.16b, v18.16b\n"
+ "ldr d3, [x3, #0x18]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr d4, [x3, #0x20]\n"
+ "mov v5.16b, v18.16b\n"
+ "ldp x28, x27, [x25, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldp x26, x13, [x25, #0x10]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldp x24, x23, [x25, #0x20]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldp x22, x21, [x25, #0x30]\n"
+ "ldp x20, x0, [x25, #0x40]\n"
+ "add x28, x28, x10\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "add x13, x13, x10\n"
+ "add x24, x24, x10\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x0, x0, x10\n"
+ "tbz x4, #2, 9f\n"
+ "ld1 { v31.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v29.s }[0], [x26], #0x4\n"
+ "ld1 { v28.s }[0], [x13], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v22.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 8f\n"
+ "ld1 { v31.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x13], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v22.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x13]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v22.b }[6], [x0]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x13]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v22.b }[4], [x0]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x4, #1, 10f\n"
+ "ld1 { v31.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v29.h }[0], [x26], #0x2\n"
+ "ld1 { v28.h }[0], [x13], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x21], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v22.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x13]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v22.b }[2], [x0]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v29.b }[0], [x26]\n"
+ "ld1 { v28.b }[0], [x13]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x21]\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v22.b }[0], [x0]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x20, [x25, #0x50]\n"
+ "add x20, x20, x10\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "usubl v29.8h, v29.8b, v9.8b\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "smlal v16.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "smlal v7.4s, v29.4h, v0.4h\n"
+ "smlal2 v17.4s, v29.8h, v0.8h\n"
+ "smlal v8.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v18.4s, v30.8h, v1.8h\n"
+ "smlal v16.4s, v27.4h, v1.4h\n"
+ "smlal2 v21.4s, v27.8h, v1.8h\n"
+ "smlal v7.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "smlal v8.4s, v23.4h, v1.4h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "smlal v15.4s, v27.4h, v2.4h\n"
+ "smlal2 v18.4s, v27.8h, v2.8h\n"
+ "smlal v16.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v23.4h, v2.4h\n"
+ "smlal2 v17.4s, v23.8h, v2.8h\n"
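+    // From here each remaining input point (see the "(row, col)" labels) is
+    // fetched with the same bit-test cascade, one point at a time, and fed
+    // straight into the multiply-accumulates.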
+ "tbz x4, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x4, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x28, [x25, #0x58]\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "add x28, x28, x10\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "smlal2 v18.4s, v25.8h, v3.8h\n"
+ "smlal v16.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "smlal v7.4s, v31.4h, v3.4h\n"
+ "smlal2 v17.4s, v31.8h, v3.8h\n"
+ "tbz x4, #2, 17f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 16f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x4, #1, 18f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr x0, [x25, #0x60]\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "add x0, x0, x10\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "smlal2 v18.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 21f\n"
+ "ld1 { v27.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 20f\n"
+ "ld1 { v27.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[6], [x0]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[4], [x0]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x4, #1, 22f\n"
+ "ld1 { v27.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[2], [x0]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[0], [x0]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d0, [x3, #0x28]\n"
+ "smlal v7.4s, v30.4h, v4.4h\n"
+ "ldr x7, [x25, #0x68]\n"
+ "add x7, x7, x10\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal2 v17.4s, v30.8h, v4.8h\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "smlal2 v18.4s, v29.8h, v0.8h\n"
+ "smlal v16.4s, v28.4h, v0.4h\n"
+ "smlal2 v21.4s, v28.8h, v0.8h\n"
+ "smlal v7.4s, v22.4h, v0.4h\n"
+ "smlal2 v17.4s, v22.8h, v0.8h\n"
+ "tbz x4, #2, 25f\n"
+ "ld1 { v25.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 24f\n"
+ "ld1 { v25.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[6], [x7]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[4], [x7]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x4, #1, 26f\n"
+ "ld1 { v25.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[2], [x7]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[0], [x7]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr d1, [x3, #0x30]\n"
+ "smlal v8.4s, v25.4h, v0.4h\n"
+ "ldr x26, [x25, #0x70]\n"
+ "add x26, x26, x10\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "smlal v16.4s, v23.4h, v1.4h\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v7.4s, v25.4h, v1.4h\n"
+ "smlal2 v17.4s, v25.8h, v1.8h\n"
+ "tbz x4, #2, 29f\n"
+ "ld1 { v24.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 28f\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[6], [x26]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[4], [x26]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x4, #1, 30f\n"
+ "ld1 { v24.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[2], [x26]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[0], [x26]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d2, [x3, #0x38]\n"
+ "smlal v8.4s, v24.4h, v1.4h\n"
+ "ldr x23, [x25, #0x78]\n"
+ "add x23, x23, x10\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "smlal2 v18.4s, v23.8h, v2.8h\n"
+ "smlal v16.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v7.4s, v24.4h, v2.4h\n"
+ "smlal2 v17.4s, v24.8h, v2.8h\n"
+ "tbz x4, #2, 33f\n"
+ "ld1 { v27.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 32f\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x4, #1, 34f\n"
+ "ld1 { v27.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[0], [x23]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d3, [x3, #0x40]\n"
+ "smlal v8.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x25, #0x80]\n"
+ "add x20, x20, x10\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v18.4s, v31.8h, v3.8h\n"
+ "smlal v16.4s, v30.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v7.4s, v27.4h, v3.4h\n"
+ "smlal2 v17.4s, v27.8h, v3.8h\n"
+ "tbz x4, #2, 37f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 36f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x4, #1, 38f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldr d4, [x3, #0x48]\n"
+ "smlal v8.4s, v23.4h, v3.4h\n"
+ "ldr x22, [x25, #0x88]\n"
+ "add x22, x22, x10\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v18.4s, v30.8h, v4.8h\n"
+ "smlal v16.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "tbz x4, #2, 41f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 40f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x4, #1, 42f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr d0, [x3, #0x50]\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "ldr x13, [x25, #0x90]\n"
+ "add x13, x13, x10\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "smlal2 v18.4s, v22.8h, v0.8h\n"
+ "smlal v16.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 45f\n"
+ "ld1 { v31.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 44f\n"
+ "ld1 { v31.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[6], [x13]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[4], [x13]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x4, #1, 46f\n"
+ "ld1 { v31.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[2], [x13]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[0], [x13]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x21, [x25, #0x98]\n"
+ "smlal v7.4s, v31.4h, v0.4h\n"
+ "add x21, x21, x10\n"
+ "smlal2 v17.4s, v31.8h, v0.8h\n"
+ "tbz x4, #2, 49f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 48f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x4, #1, 50f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d1, [x3, #0x58]\n"
+ "smlal v8.4s, v30.4h, v0.4h\n"
+ "ldr x14, [x25, #0xa0]\n"
+ "add x14, x14, x10\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v15.4s, v25.4h, v1.4h\n"
+ "smlal2 v18.4s, v25.8h, v1.8h\n"
+ "smlal v16.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "smlal v7.4s, v30.4h, v1.4h\n"
+ "smlal2 v17.4s, v30.8h, v1.8h\n"
+ "tbz x4, #2, 53f\n"
+ "ld1 { v26.s }[0], [x14], #0x4\n"
+ "tbz x4, #1, 52f\n"
+ "ld1 { v26.h }[2], [x14], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[6], [x14]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[4], [x14]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x4, #1, 54f\n"
+ "ld1 { v26.h }[0], [x14], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[2], [x14]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[0], [x14]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ldr d2, [x3, #0x60]\n"
+ "smlal v8.4s, v26.4h, v1.4h\n"
+ "ldr x11, [x25, #0xa8]\n"
+ "add x11, x11, x10\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v18.4s, v24.8h, v2.8h\n"
+ "smlal v16.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "smlal v7.4s, v26.4h, v2.4h\n"
+ "smlal2 v17.4s, v26.8h, v2.8h\n"
+ "tbz x4, #2, 57f\n"
+ "ld1 { v25.s }[0], [x11], #0x4\n"
+ "tbz x4, #1, 56f\n"
+ "ld1 { v25.h }[2], [x11], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[6], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[4], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x4, #1, 58f\n"
+ "ld1 { v25.h }[0], [x11], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[0], [x11]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr d3, [x3, #0x68]\n"
+ "smlal v8.4s, v25.4h, v2.4h\n"
+ "ldr x24, [x25, #0xb0]\n"
+ "add x24, x24, x10\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v18.4s, v27.8h, v3.8h\n"
+ "smlal v16.4s, v23.4h, v3.4h\n"
+ "smlal2 v21.4s, v23.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 61f\n"
+ "ld1 { v24.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 60f\n"
+ "ld1 { v24.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[6], [x24]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[4], [x24]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x4, #1, 62f\n"
+ "ld1 { v24.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[2], [x24]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[0], [x24]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d4, [x3, #0x70]\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "ldr x0, [x25, #0xb8]\n"
+ "add x0, x0, x10\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v18.4s, v23.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "smlal v7.4s, v24.4h, v4.4h\n"
+ "smlal2 v17.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 65f\n"
+ "ld1 { v22.s }[0], [x0], #0x4\n"
+ "tbz x4, #1, 64f\n"
+ "ld1 { v22.h }[2], [x0], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[6], [x0]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[4], [x0]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x4, #1, 66f\n"
+ "ld1 { v22.h }[0], [x0], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[2], [x0]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[0], [x0]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "ldr d0, [x3, #0x78]\n"
+ "smlal v8.4s, v22.4h, v4.4h\n"
+ "ldr x15, [x25, #0xc0]\n"
+ "add x15, x15, x10\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "smlal v16.4s, v30.4h, v0.4h\n"
+ "smlal2 v21.4s, v30.8h, v0.8h\n"
+ "tbz x4, #2, 69f\n"
+ "ld1 { v27.s }[0], [x15], #0x4\n"
+ "tbz x4, #1, 68f\n"
+ "ld1 { v27.h }[2], [x15], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[6], [x15]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[4], [x15]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x4, #1, 70f\n"
+ "ld1 { v27.h }[0], [x15], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[2], [x15]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[0], [x15]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr x9, [x25, #0xc8]\n"
+ "smlal v7.4s, v27.4h, v0.4h\n"
+ "add x9, x9, x10\n"
+ "smlal2 v17.4s, v27.8h, v0.8h\n"
+ "tbz x4, #2, 73f\n"
+ "ld1 { v23.s }[0], [x9], #0x4\n"
+ "tbz x4, #1, 72f\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[6], [x9]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[4], [x9]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x4, #1, 74f\n"
+ "ld1 { v23.h }[0], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[2], [x9]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[0], [x9]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldr d1, [x3, #0x80]\n"
+ "smlal v8.4s, v23.4h, v0.4h\n"
+ "ldr x27, [x25, #0xd0]\n"
+ "add x27, x27, x10\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
+ "smlal2 v18.4s, v30.8h, v1.8h\n"
+ "smlal v16.4s, v26.4h, v1.4h\n"
+ "smlal2 v21.4s, v26.8h, v1.8h\n"
+ "smlal v7.4s, v23.4h, v1.4h\n"
+ "smlal2 v17.4s, v23.8h, v1.8h\n"
+ "tbz x4, #2, 77f\n"
+ "ld1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x4, #1, 76f\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[6], [x27]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x4, #1, 78f\n"
+ "ld1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[2], [x27]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[0], [x27]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr d2, [x3, #0x88]\n"
+ "smlal v8.4s, v31.4h, v1.4h\n"
+ "ldr x28, [x25, #0xd8]\n"
+ "add x28, x28, x10\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v18.4s, v26.8h, v2.8h\n"
+ "smlal v16.4s, v25.4h, v2.4h\n"
+ "smlal2 v21.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "tbz x4, #2, 81f\n"
+ "ld1 { v30.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 80f\n"
+ "ld1 { v30.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[6], [x28]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[4], [x28]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x4, #1, 82f\n"
+ "ld1 { v30.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[2], [x28]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[0], [x28]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d3, [x3, #0x90]\n"
+ "smlal v8.4s, v30.4h, v2.4h\n"
+ "ldr x12, [x25, #0xe0]\n"
+ "add x12, x12, x10\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v18.4s, v25.8h, v3.8h\n"
+ "smlal v16.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v24.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "tbz x4, #2, 85f\n"
+ "ld1 { v28.s }[0], [x12], #0x4\n"
+ "tbz x4, #1, 84f\n"
+ "ld1 { v28.h }[2], [x12], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[6], [x12]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[4], [x12]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x4, #1, 86f\n"
+ "ld1 { v28.h }[0], [x12], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[2], [x12]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[0], [x12]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr d4, [x3, #0x98]\n"
+ "smlal v8.4s, v28.4h, v3.4h\n"
+ "ldr x7, [x25, #0xe8]\n"
+ "add x7, x7, x10\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v18.4s, v24.8h, v4.8h\n"
+ "smlal v16.4s, v22.4h, v4.4h\n"
+ "smlal2 v21.4s, v22.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v17.4s, v28.8h, v4.8h\n"
+ "tbz x4, #2, 89f\n"
+ "ld1 { v26.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 88f\n"
+ "ld1 { v26.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[6], [x7]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[4], [x7]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x4, #1, 90f\n"
+ "ld1 { v26.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[2], [x7]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[0], [x7]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "ldr d0, [x3, #0xa0]\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "ldr x26, [x25, #0xf0]\n"
+ "add x26, x26, x10\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v15.4s, v27.4h, v0.4h\n"
+ "smlal2 v18.4s, v27.8h, v0.8h\n"
+ "smlal v16.4s, v23.4h, v0.4h\n"
+ "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "tbz x4, #2, 93f\n"
+ "ld1 { v25.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 92f\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x4, #1, 94f\n"
+ "ld1 { v25.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[0], [x26]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr x23, [x25, #0xf8]\n"
+ "smlal v7.4s, v25.4h, v0.4h\n"
+ "add x23, x23, x10\n"
+ "smlal2 v17.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 97f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 96f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x4, #1, 98f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[0], [x23]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d1, [x3, #0xa8]\n"
+ "smlal v8.4s, v24.4h, v0.4h\n"
+ "ldr x22, [x25, #0x100]\n"
+ "add x22, x22, x10\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v15.4s, v23.4h, v1.4h\n"
+ "smlal2 v18.4s, v23.8h, v1.8h\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v21.4s, v31.8h, v1.8h\n"
+ "smlal v7.4s, v24.4h, v1.4h\n"
+ "smlal2 v17.4s, v24.8h, v1.8h\n"
+ "tbz x4, #2, 101f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 100f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x4, #1, 102f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d2, [x3, #0xb0]\n"
+ "smlal v8.4s, v27.4h, v1.4h\n"
+ "ldr x20, [x25, #0x108]\n"
+ "add x20, x20, x10\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v15.4s, v31.4h, v2.4h\n"
+ "smlal2 v18.4s, v31.8h, v2.8h\n"
+ "smlal v16.4s, v30.4h, v2.4h\n"
+ "smlal2 v21.4s, v30.8h, v2.8h\n"
+ "smlal v7.4s, v27.4h, v2.4h\n"
+ "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "tbz x4, #2, 105f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 104f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x4, #1, 106f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr d3, [x3, #0xb8]\n"
+ "smlal v8.4s, v25.4h, v2.4h\n"
+ "ldr x13, [x25, #0x110]\n"
+ "add x13, x13, x10\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v15.4s, v30.4h, v3.4h\n"
+ "smlal2 v18.4s, v30.8h, v3.8h\n"
+ "smlal v16.4s, v28.4h, v3.4h\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 109f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 108f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[6], [x13]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[4], [x13]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x4, #1, 110f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[2], [x13]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[0], [x13]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d4, [x3, #0xc0]\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "ldr x21, [x25, #0x118]\n"
+ "add x21, x21, x10\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v15.4s, v28.4h, v4.4h\n"
+ "smlal2 v18.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "smlal v7.4s, v24.4h, v4.4h\n"
+ "smlal2 v17.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 113f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x4, #1, 112f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x4, #1, 114f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "tbz x4, #2, 117f\n"
+ "ld1 { v6.4s }, [x2], #0x10\n"
+ "ld1 { v19.4s }, [x5], #0x10\n"
+ "tbz x4, #1, 116f\n"
+ "ld1 { v20.d }[0], [x2], #0x8\n"
+ "ld1 { v12.d }[0], [x5], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v20.s }[2], [x2]\n"
+ "ld1 { v12.s }[2], [x5]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v20.s }[0], [x2]\n"
+ "ld1 { v12.s }[0], [x5]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x4, #1, 118f\n"
+ "ld1 { v6.d }[0], [x2], #0x8\n"
+ "ld1 { v19.d }[0], [x5], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v6.s }[2], [x2]\n"
+ "ld1 { v19.s }[2], [x5]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v6.s }[0], [x2]\n"
+ "ld1 { v19.s }[0], [x5]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "add x17, x17, x1\n"
+ "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+ "add x16, x16, x1\n"
+ "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+ "add x6, x6, x1\n"
+ "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+ "add x8, x8, x1\n"
+ "sqrdmulh v7.4s, v7.4s, v6.4s\n"
+ "and v28.16b, v15.16b, v19.16b\n"
+ "and v26.16b, v18.16b, v12.16b\n"
+ "and v29.16b, v16.16b, v19.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v28.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "sqadd v16.4s, v16.4s, v29.4s\n"
+ "and v4.16b, v21.16b, v12.16b\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v12.4s\n"
+ "srshl v16.4s, v16.4s, v19.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "add v15.4s, v15.4s, v10.4s\n"
+ "add v18.4s, v18.4s, v10.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "smin v15.4s, v15.4s, v13.4s\n"
+ "smin v18.4s, v18.4s, v13.4s\n"
+ "smin v16.4s, v16.4s, v13.4s\n"
+ "smax v15.4s, v15.4s, v11.4s\n"
+ "smax v18.4s, v18.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v11.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "uzp1 v15.16b, v15.16b, v18.16b\n"
+ "and v25.16b, v7.16b, v19.16b\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "srshl v21.4s, v21.4s, v12.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v6.4s\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "sqadd v7.4s, v7.4s, v25.4s\n"
+ "and v31.16b, v17.16b, v12.16b\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "and v24.16b, v8.16b, v19.16b\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "smax v21.4s, v21.4s, v11.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "uzp1 v16.16b, v16.16b, v21.16b\n"
+ "add v7.4s, v7.4s, v10.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "sqadd v17.4s, v17.4s, v31.4s\n"
+ "smin v7.4s, v7.4s, v13.4s\n"
+ "sqadd v8.4s, v8.4s, v24.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+ "smax v7.4s, v7.4s, v11.4s\n"
+ "srshl v17.4s, v17.4s, v12.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
+ "and v1.16b, v5.16b, v12.16b\n"
+ "add v17.4s, v17.4s, v10.4s\n"
+ "add v8.4s, v8.4s, v10.4s\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smin v17.4s, v17.4s, v13.4s\n"
+ "smin v8.4s, v8.4s, v13.4s\n"
+ "sqadd v5.4s, v5.4s, v1.4s\n"
+ "smax v17.4s, v17.4s, v11.4s\n"
+ "smax v8.4s, v8.4s, v11.4s\n"
+ "srshl v5.4s, v5.4s, v12.4s\n"
+ "uzp1 v7.16b, v7.16b, v17.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "smax v5.4s, v5.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v5.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "tbz x4, #2, 121f\n"
+ "st1 { v15.s }[0], [x17], #0x4\n"
+ "st1 { v16.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x6], #0x4\n"
+ "st1 { v8.s }[0], [x8], #0x4\n"
+ "tbz x4, #1, 120f\n"
+ "st1 { v15.h }[2], [x17], #0x2\n"
+ "st1 { v16.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x6], #0x2\n"
+ "st1 { v8.h }[2], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[6], [x17], #0x1\n"
+ "st1 { v16.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x6], #0x1\n"
+ "st1 { v8.b }[6], [x8], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[4], [x17], #0x1\n"
+ "st1 { v16.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x6], #0x1\n"
+ "st1 { v8.b }[4], [x8], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 122f\n"
+ "st1 { v15.h }[0], [x17], #0x2\n"
+ "st1 { v16.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x6], #0x2\n"
+ "st1 { v8.h }[0], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[2], [x17], #0x1\n"
+ "st1 { v16.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x6], #0x1\n"
+ "st1 { v8.b }[2], [x8], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[0], [x17], #0x1\n"
+ "st1 { v16.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x6], #0x1\n"
+ "st1 { v8.b }[0], [x8], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+
+ "124:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
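
The requantization tail above (sqrdmulh, the and/sshr/sqadd fixup, srshl, the c_offset add, and the min/max clamp before uzp1) is the standard fixed-point multiply-and-round sequence. Below is a minimal scalar sketch of that arithmetic, assuming a non-negative right-shift count (the shift registers in the assembly hold the negated count, since SRSHL shifts right when given a negative operand); the function names are mine, not the library's.

#include <algorithm>
#include <cstdint>

// Scalar equivalent of SQRDMULH: saturating rounding doubling high multiply.
// (The single saturating case, INT32_MIN * INT32_MIN, is ignored here.)
static int32_t rdmulh(int32_t a, int32_t b)
{
    return (int32_t) (((int64_t) a * b + (1LL << 30)) >> 31);
}

// Rounding arithmetic shift right with ties away from zero. SRSHL alone
// rounds half up; the and/sshr/sqadd sequence before it subtracts one from
// negative values first, which is the correction this function models.
static int32_t rshr_round(int32_t x, int shift)
{
    if (shift <= 0) return x;
    return ((x < 0 ? x - 1 : x) + (1 << (shift - 1))) >> shift;
}

// One 32-bit accumulator -> one uint8_t output, as in the tail above.
static uint8_t requantize(int32_t acc, int32_t mul, int shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    const int32_t v = rshr_round(rdmulh(acc, mul), shift) + c_offset;
    return (uint8_t) std::min(std::max(v, minval), maxval);
}
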
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2bfeac0556
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+struct a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1633639ad5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v12.4s }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "ld1r { v10.16b }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v9.16b }, [x20]\n"
+ "ld1r { v8.4s }, [x19]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "mov x11, #0x0\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "lsr x10, %x[n_channels], #0x2\n"
+ "cbz x10, 6f\n"
+ "1:" // Channel loop
+ "movi v27.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x11, #0x2\n"
+ "ldr q27, [%x[bias], x19]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov v26.16b, v27.16b\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v25.16b, v27.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "mov v24.16b, v27.16b\n"
+ "ldr s4, [x9, x11]\n"
+ "mov v23.16b, v27.16b\n"
+ "mov v22.16b, v27.16b\n"
+ "ldr s3, [x28, x11]\n"
+ "mov v21.16b, v27.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v20.16b, v27.16b\n"
+ "ldr s2, [x27, x11]\n"
+ "mov v19.16b, v27.16b\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "ldr s1, [x26, x11]\n"
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "ldr s0, [x25, x11]\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "ldr s31, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "ldr s30, [x23, x11]\n"
+ "ldr s29, [x22, x11]\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "ldr x21, [x20], #0x8\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "ldr s28, [x21, x11]\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "ldr s4, [x9, x11]\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "ldr s3, [x28, x11]\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "ldr s2, [x27, x11]\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "ldr s1, [x26, x11]\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "ldr s0, [x25, x11]\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "ldr s31, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "ldr s30, [x23, x11]\n"
+ "ldr s29, [x22, x11]\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "ldr x21, [x20], #0x8\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "ldr s28, [x21, x11]\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x19, x11, #0x2\n"
+ "ldr q6, [%x[rq_mul_ptr], x19]\n"
+ "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v27.4s, v27.4s, v7.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "sshl v26.4s, v26.4s, v7.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v7.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+ "sshl v24.4s, v24.4s, v7.4s\n"
+ "and v16.16b, v27.16b, v5.16b\n"
+ "and v18.16b, v26.16b, v5.16b\n"
+ "and v17.16b, v25.16b, v5.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "and v16.16b, v24.16b, v5.16b\n"
+ "add v27.4s, v27.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v8.4s\n"
+ "add v25.4s, v25.4s, v8.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v27.4s, v27.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v25.4s, v25.4s, v12.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x27, x11]\n"
+ "add v24.4s, v24.4s, v8.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x26, x11]\n"
+ "smax v24.4s, v24.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "sshl v23.4s, v23.4s, v7.4s\n"
+ "sshl v22.4s, v22.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sshl v21.4s, v21.4s, v7.4s\n"
+ "and v17.16b, v23.16b, v5.16b\n"
+ "and v16.16b, v22.16b, v5.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x24, x11]\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "sshl v20.4s, v20.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v7.4s\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+ "add v23.4s, v23.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "and v17.16b, v20.16b, v5.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v5.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v8.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "smax v21.4s, v21.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x23, x11]\n"
+ "add v19.4s, v19.4s, v8.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x22, x11]\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x21, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x20, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x19, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x10, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v27.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x19, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v27.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v27.s }[2], [x19], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+
+ "9:" // Oddments: Load bias: Done
+ "mov v26.16b, v27.16b\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v25.16b, v27.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "mov v24.16b, v27.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v23.16b, v27.16b\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "mov v22.16b, v27.16b\n"
+ "add x28, x28, x11\n"
+ "mov v21.16b, v27.16b\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "mov v20.16b, v27.16b\n"
+ "add x27, x27, x11\n"
+ "mov v19.16b, v27.16b\n"
+ "ldr x21, [x20], #0x8\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h31, [x24], #0x2\n"
+ "ldr h30, [x23], #0x2\n"
+ "ldr h29, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v4.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v2.b }[2], [x27], #0x1\n"
+ "ld1 { v1.b }[2], [x26], #0x1\n"
+ "ld1 { v0.b }[2], [x25], #0x1\n"
+ "ld1 { v31.b }[2], [x24], #0x1\n"
+ "ld1 { v30.b }[2], [x23], #0x1\n"
+ "ld1 { v29.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ldr b4, [x9], #0x1\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b2, [x27], #0x1\n"
+ "ldr b1, [x26], #0x1\n"
+ "ldr b0, [x25], #0x1\n"
+ "ldr b31, [x24], #0x1\n"
+ "ldr b30, [x23], #0x1\n"
+ "ldr b29, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "add x9, x9, x11\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "add x28, x28, x11\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "add x27, x27, x11\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "ldr x21, [x20], #0x8\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "add x26, x26, x11\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "ldr s16, [%x[params]], #0x4\n"
+ "add x25, x25, x11\n"
+ "ssubl v16.8h, v16.8b, v9.8b\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h4, [x9], #0x2\n"
+ "ldr h3, [x28], #0x2\n"
+ "ldr h2, [x27], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h31, [x24], #0x2\n"
+ "ldr h30, [x23], #0x2\n"
+ "ldr h29, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v4.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x28], #0x1\n"
+ "ld1 { v2.b }[2], [x27], #0x1\n"
+ "ld1 { v1.b }[2], [x26], #0x1\n"
+ "ld1 { v0.b }[2], [x25], #0x1\n"
+ "ld1 { v31.b }[2], [x24], #0x1\n"
+ "ld1 { v30.b }[2], [x23], #0x1\n"
+ "ld1 { v29.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ldr b4, [x9], #0x1\n"
+ "ldr b3, [x28], #0x1\n"
+ "ldr b2, [x27], #0x1\n"
+ "ldr b1, [x26], #0x1\n"
+ "ldr b0, [x25], #0x1\n"
+ "ldr b31, [x24], #0x1\n"
+ "ldr b30, [x23], #0x1\n"
+ "ldr b29, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "usubl v4.8h, v4.8b, v10.8b\n"
+ "subs x19, x19, #0x1\n"
+ "usubl v3.8h, v3.8b, v10.8b\n"
+ "usubl v2.8h, v2.8b, v10.8b\n"
+ "usubl v1.8h, v1.8b, v10.8b\n"
+ "usubl v0.8h, v0.8b, v10.8b\n"
+ "usubl v31.8h, v31.8b, v10.8b\n"
+ "usubl v30.8h, v30.8b, v10.8b\n"
+ "usubl v29.8h, v29.8b, v10.8b\n"
+ "usubl v28.8h, v28.8b, v10.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v27.4s, v4.4h, v16.4h\n"
+ "smlal v26.4s, v3.4h, v16.4h\n"
+ "smlal v25.4s, v2.4h, v16.4h\n"
+ "smlal v24.4s, v1.4h, v16.4h\n"
+ "smlal v23.4s, v0.4h, v16.4h\n"
+ "smlal v22.4s, v31.4h, v16.4h\n"
+ "smlal v21.4s, v30.4h, v16.4h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal v19.4s, v28.4h, v16.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v6.d }[0], [x21], #0x8\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v7.d }[0], [x19], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v6.s }[2], [x21], #0x4\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v7.s }[2], [x19], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v6.s }[0], [x21], #0x4\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v7.s }[0], [x19], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v27.4s, v27.4s, v7.4s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "add x27, x27, x11\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "sshl v26.4s, v26.4s, v7.4s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "add x26, x26, x11\n"
+ "sshl v25.4s, v25.4s, v7.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "sshl v24.4s, v24.4s, v7.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x25, x25, x11\n"
+ "and v16.16b, v27.16b, v5.16b\n"
+ "add x24, x24, x11\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "add x23, x23, x11\n"
+ "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+ "add x22, x22, x11\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ "add x21, x21, x11\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x20, x20, x11\n"
+ "and v18.16b, v26.16b, v5.16b\n"
+ "add x19, x19, x11\n"
+ "and v17.16b, v25.16b, v5.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v24.16b, v5.16b\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v27.4s, v27.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v8.4s\n"
+ "add v25.4s, v25.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v12.4s\n"
+ "smax v25.4s, v25.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smax v24.4s, v24.4s, v12.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sshl v23.4s, v23.4s, v7.4s\n"
+ "sshl v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v20.4s, v20.4s, v7.4s\n"
+ "and v17.16b, v23.16b, v5.16b\n"
+ "and v16.16b, v22.16b, v5.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "and v17.16b, v20.16b, v5.16b\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add v23.4s, v23.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "smax v23.4s, v23.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v8.4s\n"
+ "add v20.4s, v20.4s, v8.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v21.4s, v21.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v12.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+ "and v16.16b, v19.16b, v5.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v27.h }[0], [x27], #0x2\n"
+ "st1 { v26.h }[0], [x26], #0x2\n"
+ "st1 { v25.h }[0], [x25], #0x2\n"
+ "st1 { v24.h }[0], [x24], #0x2\n"
+ "st1 { v23.h }[0], [x23], #0x2\n"
+ "st1 { v22.h }[0], [x22], #0x2\n"
+ "st1 { v21.h }[0], [x21], #0x2\n"
+ "st1 { v20.h }[0], [x20], #0x2\n"
+ "st1 { v19.h }[0], [x19], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v27.b }[2], [x27], #0x1\n"
+ "st1 { v26.b }[2], [x26], #0x1\n"
+ "st1 { v25.b }[2], [x25], #0x1\n"
+ "st1 { v24.b }[2], [x24], #0x1\n"
+ "st1 { v23.b }[2], [x23], #0x1\n"
+ "st1 { v22.b }[2], [x22], #0x1\n"
+ "st1 { v21.b }[2], [x21], #0x1\n"
+ "st1 { v20.b }[2], [x20], #0x1\n"
+ "st1 { v19.b }[2], [x19], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v27.b }[0], [x27], #0x1\n"
+ "st1 { v26.b }[0], [x26], #0x1\n"
+ "st1 { v25.b }[0], [x25], #0x1\n"
+ "st1 { v24.b }[0], [x24], #0x1\n"
+ "st1 { v23.b }[0], [x23], #0x1\n"
+ "st1 { v22.b }[0], [x22], #0x1\n"
+ "st1 { v21.b }[0], [x21], #0x1\n"
+ "st1 { v20.b }[0], [x20], #0x1\n"
+ "st1 { v19.b }[0], [x19], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+
+ "24:" // End
+
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
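
The planar loop in the file above gathers, per kernel point, nine input pointers and one weight value per channel, and accumulates into nine 32-bit lanes (four channels at a time) before the shared requantization tail. A scalar model of the accumulation follows; it is a sketch under the pointer layout described, with weight(k, c) standing in for the packed weight stream the assembly pulls from params.

#include <cstdint>

// Scalar model of one output value of the output9 kernel above. inptrs holds
// n_points groups of nine pointers (one per output point); weight(k, c)
// abstracts the packed weight stream; a_offset/b_offset are the Requantize32
// input and weight offsets subtracted by the usubl/ssubl instructions.
template <typename WeightFn>
int32_t output9_acc(const uint8_t *const *inptrs, const int32_t *bias,
                    WeightFn weight, unsigned n_points,
                    unsigned c, unsigned p,  // channel and output point (0..8)
                    int32_t a_offset, int32_t b_offset)
{
    int32_t acc = bias ? bias[c] : 0;
    for (unsigned k = 0; k < n_points; k++)
        acc += ((int32_t) inptrs[k * 9 + p][c] - a_offset)
             * ((int32_t) weight(k, c) - b_offset);
    return acc; // then: sshl, sqrdmulh, srshl, + c_offset, clamp, narrow
}
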
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..802030573e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int output_rows(void) { return 2; };
+ constexpr static unsigned int output_cols(void) { return 8; };
+
+ constexpr static unsigned int output_col_regs(void) { return 2; };
+
+ kern_type kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+ a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
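
The struct above fixes a 2-row by 8-column output patch. In the implementation that follows, the sixteen accumulators v6..v19 each hold four output channels of one output point, and every smlal multiplies the four-lane weight vector by a single input lane (v3.h[0..7] covering one row's eight points, v2.h[0..7] the other). A hedged model of that accumulation, with in(k, r, c) and w(k, oc) standing in for the pointer walking done in the assembly; since this kernel applies a channel multiplier, one input value feeds several output channels.

#include <cstdint>

// Model of the 2x8 accumulation: acc[r][c][oc] is one lane of the sixteen
// four-lane vector accumulators in the kernel loop that follows.
template <typename InFn, typename WFn>
void acc_2x8(int32_t acc[2][8][4], InFn in, WFn w, unsigned kernel_points,
             int32_t a_offset, int32_t b_offset)
{
    for (unsigned k = 0; k < kernel_points; k++)
        for (unsigned r = 0; r < 2; r++)             // output rows
            for (unsigned c = 0; c < 8; c++)         // output columns
                for (unsigned oc = 0; oc < 4; oc++)  // output-channel lanes
                    acc[r][c][oc] += ((int32_t) in(k, r, c) - a_offset)
                                   * ((int32_t) w(k, oc) - b_offset);
}
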
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..152999dd1a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v13.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v12.16b }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v11.16b }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v10.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v9.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v8.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v7.4s }, [x19]\n"
+ "lsr x28, %x[n_output_channels], #0x2\n"
+ "cbz x28, 9f\n"
+ "1:" // Output channel loop
+ "movi v16.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x19, x9, #0x2\n"
+ "ldr q16, [%x[bias], x19]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v6.16b, v16.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "mov v4.16b, v16.16b\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v16.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x19, x9, #0x2\n"
+ "ldr q8, [%x[rq_mul_ptr], x19]\n"
+ "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s17, [%x[weights]], #0x4\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "mov x19, %x[inptrs]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "ldr d3, [x25, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "cbz x20, 7f\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr d1, [x25, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldr d1, [x25, #0x0]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s6, [x19, x9]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "str s5, [x20, x9]\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s4, [x21, x9]\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s31, [x22, x9]\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s30, [x23, x9]\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s29, [x24, x9]\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s28, [x25, x9]\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s27, [x26, x9]\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x19, x9]\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x20, x9]\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x21, x9]\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s23, [x22, x9]\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x23, x9]\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s21, [x24, x9]\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s20, [x25, x9]\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s19, [x26, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x28, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v16.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x19, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v16.s }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v16.s }[0], [x19]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v6.16b, v16.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "mov v4.16b, v16.16b\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v16.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v27.16b, v16.16b\n"
+ "mov v26.16b, v16.16b\n"
+ "mov v25.16b, v16.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "mov v23.16b, v16.16b\n"
+ "mov v22.16b, v16.16b\n"
+ "mov v21.16b, v16.16b\n"
+ "mov v20.16b, v16.16b\n"
+ "mov v19.16b, v16.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v9.d }[0], [x19], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v7.s }[2], [x20], #0x4\n"
+ "ld1 { v9.s }[2], [x19], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
+ "ld1 { v9.s }[0], [x19], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v7.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s17, [%x[weights]], #0x4\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "mov x19, %x[inptrs]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "lsr x20, %x[kernel_points], #0x1\n"
+ "ldr d3, [x25, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "cbz x20, 22f\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "ldr d1, [x25, #0x0]\n"
+ "subs x20, x20, #0x1\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "subs x20, x20, #0x1\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "ldr d1, [x25, #0x0]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "ldr d0, [x27, #0x0]\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr s16, [%x[weights]], #0x4\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "ldp x25, x27, [x19], #0x10\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr s17, [%x[weights]], #0x4\n"
+ "smlal v6.4s, v16.4h, v1.h[0]\n"
+ "smlal v5.4s, v16.4h, v1.h[1]\n"
+ "smlal v4.4s, v16.4h, v1.h[2]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v31.4s, v16.4h, v1.h[3]\n"
+ "smlal v30.4s, v16.4h, v1.h[4]\n"
+ "smlal v29.4s, v16.4h, v1.h[5]\n"
+ "smlal v28.4s, v16.4h, v1.h[6]\n"
+ "smlal v27.4s, v16.4h, v1.h[7]\n"
+ "smlal v26.4s, v16.4h, v0.h[0]\n"
+ "smlal v25.4s, v16.4h, v0.h[1]\n"
+ "smlal v24.4s, v16.4h, v0.h[2]\n"
+ "smlal v23.4s, v16.4h, v0.h[3]\n"
+ "smlal v22.4s, v16.4h, v0.h[4]\n"
+ "smlal v21.4s, v16.4h, v0.h[5]\n"
+ "smlal v20.4s, v16.4h, v0.h[6]\n"
+ "smlal v19.4s, v16.4h, v0.h[7]\n"
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v6.4s, v17.4h, v3.h[0]\n"
+ "smlal v5.4s, v17.4h, v3.h[1]\n"
+ "smlal v4.4s, v17.4h, v3.h[2]\n"
+ "smlal v31.4s, v17.4h, v3.h[3]\n"
+ "smlal v30.4s, v17.4h, v3.h[4]\n"
+ "smlal v29.4s, v17.4h, v3.h[5]\n"
+ "smlal v28.4s, v17.4h, v3.h[6]\n"
+ "smlal v27.4s, v17.4h, v3.h[7]\n"
+ "smlal v26.4s, v17.4h, v2.h[0]\n"
+ "smlal v25.4s, v17.4h, v2.h[1]\n"
+ "smlal v24.4s, v17.4h, v2.h[2]\n"
+ "smlal v23.4s, v17.4h, v2.h[3]\n"
+ "smlal v22.4s, v17.4h, v2.h[4]\n"
+ "smlal v21.4s, v17.4h, v2.h[5]\n"
+ "smlal v20.4s, v17.4h, v2.h[6]\n"
+ "smlal v19.4s, v17.4h, v2.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v6.4s, v6.4s, v9.4s\n"
+ "sshl v5.4s, v5.4s, v9.4s\n"
+ "sshl v4.4s, v4.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+ "sshl v31.4s, v31.4s, v9.4s\n"
+ "and v18.16b, v6.16b, v7.16b\n"
+ "and v16.16b, v5.16b, v7.16b\n"
+ "and v17.16b, v4.16b, v7.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "sqadd v4.4s, v4.4s, v17.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v7.4s\n"
+ "srshl v5.4s, v5.4s, v7.4s\n"
+ "srshl v4.4s, v4.4s, v7.4s\n"
+ "and v16.16b, v31.16b, v7.16b\n"
+ "add v6.4s, v6.4s, v10.4s\n"
+ "add v5.4s, v5.4s, v10.4s\n"
+ "add v4.4s, v4.4s, v10.4s\n"
+ "smin v6.4s, v6.4s, v13.4s\n"
+ "smin v5.4s, v5.4s, v13.4s\n"
+ "smin v4.4s, v4.4s, v13.4s\n"
+ "smax v6.4s, v6.4s, v14.4s\n"
+ "smax v5.4s, v5.4s, v14.4s\n"
+ "smax v4.4s, v4.4s, v14.4s\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "sshl v30.4s, v30.4s, v9.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sshl v29.4s, v29.4s, v9.4s\n"
+ "sshl v28.4s, v28.4s, v9.4s\n"
+ "srshl v31.4s, v31.4s, v7.4s\n"
+ "and v16.16b, v30.16b, v7.16b\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "add v31.4s, v31.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v17.16b, v29.16b, v7.16b\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "and v16.16b, v28.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v7.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "add v30.4s, v30.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "sshl v27.4s, v27.4s, v9.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v10.4s\n"
+ "srshl v28.4s, v28.4s, v7.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "add v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "sshl v26.4s, v26.4s, v9.4s\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "and v16.16b, v27.16b, v7.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "and v17.16b, v26.16b, v7.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshl v25.4s, v25.4s, v9.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v7.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sshl v24.4s, v24.4s, v9.4s\n"
+ "and v16.16b, v25.16b, v7.16b\n"
+ "add v27.4s, v27.4s, v10.4s\n"
+ "srshl v26.4s, v26.4s, v7.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "and v17.16b, v24.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "srshl v25.4s, v25.4s, v7.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "sshl v23.4s, v23.4s, v9.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "add v25.4s, v25.4s, v10.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "sshl v22.4s, v22.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v7.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "and v16.16b, v23.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "add v24.4s, v24.4s, v10.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "and v17.16b, v22.16b, v7.16b\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshl v21.4s, v21.4s, v9.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "srshl v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v10.4s\n"
+ "sshl v20.4s, v20.4s, v9.4s\n"
+ "srshl v22.4s, v22.4s, v7.4s\n"
+ "smin v23.4s, v23.4s, v13.4s\n"
+ "and v16.16b, v21.16b, v7.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v22.4s, v22.4s, v13.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v14.4s\n"
+ "and v16.16b, v20.16b, v7.16b\n"
+ "sshl v19.4s, v19.4s, v9.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "srshl v21.4s, v21.4s, v7.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+ "add v21.4s, v21.4s, v10.4s\n"
+ "sqadd v20.4s, v20.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v13.4s\n"
+ "and v16.16b, v19.16b, v7.16b\n"
+ "srshl v20.4s, v20.4s, v7.4s\n"
+ "smax v21.4s, v21.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "add v20.4s, v20.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smin v20.4s, v20.4s, v13.4s\n"
+ "srshl v19.4s, v19.4s, v7.4s\n"
+ "smax v20.4s, v20.4s, v14.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "add v19.4s, v19.4s, v10.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "smin v19.4s, v19.4s, v13.4s\n"
+ "smax v19.4s, v19.4s, v14.4s\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.h }[0], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.h }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.h }[0], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.h }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.h }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.h }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.h }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.h }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.h }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.h }[0], [x25]\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v19.h }[0], [x26]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.b }[2], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.b }[2], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.b }[2], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.b }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.b }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.b }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.b }[2], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.b }[2], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.b }[2], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.b }[2], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.b }[2], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.b }[2], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.b }[2], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.b }[2], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v19.b }[2], [x26]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "add x19, x19, x9\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "add x20, x20, x9\n"
+ "st1 { v6.b }[0], [x19]\n"
+ "add x21, x21, x9\n"
+ "st1 { v5.b }[0], [x20]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "add x22, x22, x9\n"
+ "st1 { v4.b }[0], [x21]\n"
+ "add x23, x23, x9\n"
+ "st1 { v31.b }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "add x24, x24, x9\n"
+ "st1 { v30.b }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "add x25, x25, x9\n"
+ "st1 { v29.b }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "add x26, x26, x9\n"
+ "st1 { v28.b }[0], [x25]\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "add x19, x19, x9\n"
+ "st1 { v27.b }[0], [x26]\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "add x20, x20, x9\n"
+ "st1 { v26.b }[0], [x19]\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "add x21, x21, x9\n"
+ "st1 { v25.b }[0], [x20]\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "add x22, x22, x9\n"
+ "st1 { v24.b }[0], [x21]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v23.b }[0], [x22]\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "add x24, x24, x9\n"
+ "st1 { v22.b }[0], [x23]\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "add x25, x25, x9\n"
+ "st1 { v21.b }[0], [x24]\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "add x26, x26, x9\n"
+ "st1 { v20.b }[0], [x25]\n"
+ "st1 { v19.b }[0], [x26]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+
+ "26:" // Done
+
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c444472c68
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
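+
+ // A 2x2 output tile at stride 1 requires a (2 + 3 - 1) x (2 + 3 - 1) = 4x4
+ // input patch.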
+
+ indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..b788c705e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
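+ // Gather the arguments into one struct so the assembly below can address
+ // every field through offsetof()-computed immediate offsets.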
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x15, #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x13, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cnth x12\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x21, XZR, x12\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x17, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x16, x11, x19\n" // offset += tile_j * ld_input_col
+ "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x10, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x27, x10, x22, LSL #1\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x27, x22, LSL #1\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n" // Load from weights and bias
+ "mov z31.d, z16.d\n"
+ "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n" // Load from weights and bias
+ "add x25, x26, x22, LSL #1\n"
+ "mov z30.d, z16.d\n"
+ "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n" // Load from weights and bias
+ "add x24, x11, x11\n"
+ "mov z29.d, z16.d\n"
+ "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n" // Load from weights and bias
+ "add x23, x24, x11\n"
+ "mov z28.d, z16.d\n"
+ "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n" // Load from weights and bias
+ "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+ "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n" // Load from weights and bias
+ "madd x19, x16, x9, x19\n" // offset += tile_j * ld_output_col
+ "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n" // Load from weights and bias
+ "mul x19, x19, x15\n" // offset *= output_tile_size
+ "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n" // Load from weights and bias
+ "add x28, x28, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z9.h }, p2/Z, [x27, x11, LSL #1]\n" // Load input point (1, 1)
+ "ld1h { z10.h }, p2/Z, [x10]\n" // Load input point (0, 0)
+ "add x22, x28, x20, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (0, 3)
+ "addvl x14, x14, #16\n"
+ "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (1, 2)
+ "cmp x12, %x[n_channels]\n"
+ "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x14, x14, #-6\n"
+ "ld1h { z13.h }, p2/Z, [x26, x11, LSL #1]\n" // Load input point (2, 1)
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n" // Load from weights and bias
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "inch x21\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x25]\n" // Load input point (3, 0)
+ "inch x13\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (2, 2)
+ "inch x12\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n" // Load input point (3, 3)
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z12.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n" // Load input point (0, 1)
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (0, 2)
+ "addvl x10, x10, #1\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (1, 0)
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (1, 3)
+ "addvl x27, x27, #1\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26]\n" // Load input point (2, 0)
+ "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (2, 3)
+ "addvl x26, x26, #1\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z13.h }, p1/Z, [x26, x11, LSL #1]\n" // Load input point (2, 1)
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n" // Load input point (3, 1)
+ "fmla z28.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n" // Load input point (3, 2)
+ "whilelt p2.h, x13, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n" // Load from weights and bias
+ "addvl x25, x25, #1\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x27, x11, LSL #1]\n" // Load input point (1, 1)
+ "cmp x12, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x10]\n" // Load input point (0, 0)
+ "fmla z28.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x10, x23, LSL #1]\n" // Load input point (0, 3)
+ "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "addvl x14, x14, #16\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x27, x24, LSL #1]\n" // Load input point (1, 2)
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n" // Load from weights and bias
+ "addvl x14, x14, #-6\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x28]\n" // Store output point (0, 0)
+ "mov z31.d, z16.d\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n" // Store output point (0, 1)
+ "mov z30.d, z16.d\n"
+ "addvl x28, x28, #1\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "mov z29.d, z16.d\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x22, x9, LSL #1]\n" // Store output point (1, 1)
+ "mov z28.d, z16.d\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x21, x17, #0x1\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x25]\n" // Load input point (3, 0)
+ "add x16, x16, #0x1\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (2, 2)
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n" // Load input point (3, 3)
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "cmp x16, x19\n"
+ "fmla z31.h, p3/M, z5.h, z12.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n" // Load input point (0, 1)
+ "csel x17, x17, x21, LT\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (0, 2)
+ "cmp x17, x20\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (1, 3)
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (1, 0)
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26]\n" // Load input point (2, 0)
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (2, 3)
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n" // Load input point (3, 1)
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n" // Load input point (3, 2)
+ "fmla z28.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x28]\n" // Store output point (0, 0)
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n" // Store output point (0, 1)
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x22, x9, LSL #1]\n" // Store output point (1, 1)
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..d8f905b33a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
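+    // The 16 input pointers cover the 4x4 input patch needed to compute a
+    // 2x2 output tile with a 3x3 kernel at stride 1.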
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[2];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[7];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[10];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[12];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+    }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
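+  // Register roles in the assembly below: p3 is an all-true predicate for
+  // weight/bias loads, p2 and p1 predicate the current and next channel
+  // blocks, z16 holds the bias vector that re-seeds the accumulators
+  // z28-z31 after each store, and z18/z17 carry the activation min/max
+  // used for the final clamp.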
+ __asm__ __volatile__(
+ "ldr x3, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x19, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mov x5, #0x0\n"
+ "ldp x6, x7, [x19, #0x0]\n"
+ "cnth x8\n"
+ "ldp x17, x16, [x19, #0x10]\n"
+ "sub x15, XZR, x8\n"
+ "ldp x14, x13, [x19, #0x20]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ldp x12, x11, [x19, #0x30]\n"
+ "cmp x8, %x[n_channels]\n"
+ "ldp x10, x9, [x19, #0x40]\n"
+ "ldp x28, x27, [x19, #0x50]\n"
+ "ldp x26, x25, [x19, #0x60]\n"
+ "ldp x24, x23, [x19, #0x70]\n"
+ "ldp x22, x21, [x3, #0x0]\n"
+ "ldp x20, x19, [x3, #0x10]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z16.h }, p3/Z, [x4]\n" // Load from weights and bias
+ "mov z31.d, z16.d\n"
+ "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+ "mov z30.d, z16.d\n"
+ "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+ "mov z29.d, z16.d\n"
+ "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+ "mov z28.d, z16.d\n"
+ "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+ "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+ "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+ "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x4, x4, #16\n"
+ "ld1h { z9.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x4, x4, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x6, x5, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x9, x5, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z16.h }, p3/Z, [x4]\n" // Load from weights and bias
+ "whilelt p1.h, x8, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "inch x15\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z12.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x9, x8, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+ "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "inch x5\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+ "whilelt p2.h, x5, %x[n_channels]\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x13, x8, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x16, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x6, x8, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "addvl x4, x4, #16\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x12, x8, LSL #1]\n"
+ "inch x8\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+ "cmp x8, %x[n_channels]\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x4, x4, #-6\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+ "mov z31.d, z16.d\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+ "mov z30.d, z16.d\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+ "mov z29.d, z16.d\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+ "mov z28.d, z16.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "inch x15\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z12.h\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f5d31e63f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
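+  // For stride 1, input extent = output extent + kernel extent - 1: a 3x3
+  // output tile consumes a 5x5 input patch.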
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
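+
+// A minimal usage sketch (hypothetical caller, not part of this file): the
+// depthwise driver instantiates the descriptor and dispatches one of its two
+// kernels per tile, e.g.
+//
+//   sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst k(cpu_info);
+//   k.direct_kernel(n_tile_rows, n_tile_cols, inptr, ld_input_row,
+//                   ld_input_col, outptr, ld_output_row, ld_output_col,
+//                   params, n_channels, activation_min, activation_max);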
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..aebf0bf7ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
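+  // The assembly walks the output tiles itself: tile_i and tile_j live in
+  // params_struct, are stored at the head of the tile loop and advanced in
+  // the channel tail, while the nine accumulators z23-z31 hold the 3x3
+  // output tile for the current channel block.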
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x6, #0x0\n"
+ "mov x7, #0x0\n"
+ "1:" // Tile loop
+ "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x24, #0x3\n"
+ "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x23, #0x3\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x17, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cnth x16\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x21, XZR, x16\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x6, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x7, x15, x19\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x14, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x11, x14, x22, LSL #1\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x10, x11, x22, LSL #1\n"
+ "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "mov z31.d, z16.d\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "add x9, x10, x22, LSL #1\n"
+ "mov z30.d, z16.d\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "add x28, x9, x22, LSL #1\n"
+ "mov z29.d, z16.d\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "add x27, x15, x15\n"
+ "mov z28.d, z16.d\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+ "add x26, x27, x15\n"
+ "mov z27.d, z16.d\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+ "add x25, x26, x15\n"
+ "mov z26.d, z16.d\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+ "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+ "mov z25.d, z16.d\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+ "madd x19, x7, x13, x19\n" // offset += tile_j * ld_output_col
+ "mov z24.d, z16.d\n"
+ "mul x19, x19, x23\n" // offset *= output_tile_size
+ "mov z23.d, z16.d\n"
+ "add x12, x12, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x13, x13\n"
+ "add x23, x12, x20, LSL #1\n"
+ "add x22, x23, x20, LSL #1\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z9.h }, p2/Z, [x10, x27, LSL #1]\n" // Load input point (2, 2)
+ "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (0, 0)
+ "addvl x8, x8, #16\n"
+ "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n" // Load input point (0, 4)
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x8, x8, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x28]\n" // Load input point (4, 0)
+ "ld1h { z13.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (1, 2)
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "inch x21\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "inch x17\n"
+ "fmla z27.h, p3/M, z4.h, z9.h\n"
+ "inch x16\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "fmla z25.h, p3/M, z2.h, z9.h\n"
+ "fmla z24.h, p3/M, z1.h, z9.h\n"
+ "fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 3)
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 4)
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z26.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n" // Load input point (0, 1)
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n" // Load input point (0, 3)
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11]\n" // Load input point (1, 0)
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n" // Load input point (1, 4)
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n" // Load input point (3, 2)
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 4)
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+ "fmla z25.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n" // Load input point (1, 1)
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 3)
+ "fmla z23.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n" // Load input point (1, 3)
+ "addvl x11, x11, #1\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z11.h\n"
+ "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (0, 2)
+ "addvl x14, x14, #1\n"
+ "fmla z24.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z10.h }, p1/Z, [x14]\n" // Load input point (0, 0)
+ "fmla z23.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 3)
+ "addvl x9, x9, #1\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10]\n" // Load input point (2, 0)
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 4)
+ "addvl x10, x10, #1\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x10, x27, LSL #1]\n" // Load input point (2, 2)
+ "fmla z26.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n" // Load input point (4, 2)
+ "whilelt p2.h, x17, %x[n_channels]\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+ "addvl x28, x28, #1\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+ "cmp x16, %x[n_channels]\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x28]\n" // Load input point (4, 0)
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n" // Load input point (0, 4)
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z25.h, p3/M, z8.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x11, x27, LSL #1]\n" // Load input point (1, 2)
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x8, x8, #16\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x8, x8, #-6\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "st1h { z31.h }, p0, [x12]\n" // Store output point (0, 0)
+ "mov z31.d, z16.d\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x12, x13, LSL #1]\n" // Store output point (0, 1)
+ "mov z30.d, z16.d\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z29.h }, p0, [x12, x24, LSL #1]\n" // Store output point (0, 2)
+ "mov z29.d, z16.d\n"
+ "addvl x12, x12, #1\n"
+ "fmax z27.h, p3/M, z27.h, z18.h\n"
+ "fmax z26.h, p3/M, z26.h, z18.h\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x23]\n" // Store output point (1, 0)
+ "mov z28.d, z16.d\n"
+ "fmin z27.h, p3/M, z27.h, z17.h\n"
+ "st1h { z27.h }, p0, [x23, x13, LSL #1]\n" // Store output point (1, 1)
+ "mov z27.d, z16.d\n"
+ "fmin z26.h, p3/M, z26.h, z17.h\n"
+ "st1h { z26.h }, p0, [x23, x24, LSL #1]\n" // Store output point (1, 2)
+ "mov z26.d, z16.d\n"
+ "addvl x23, x23, #1\n"
+ "fmax z25.h, p3/M, z25.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z18.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmin z25.h, p3/M, z25.h, z17.h\n"
+ "st1h { z25.h }, p0, [x22]\n" // Store output point (2, 0)
+ "mov z25.d, z16.d\n"
+ "fmin z24.h, p3/M, z24.h, z17.h\n"
+ "st1h { z24.h }, p0, [x22, x13, LSL #1]\n" // Store output point (2, 1)
+ "mov z24.d, z16.d\n"
+ "fmin z23.h, p3/M, z23.h, z17.h\n"
+ "st1h { z23.h }, p0, [x22, x24, LSL #1]\n" // Store output point (2, 2)
+ "mov z23.d, z16.d\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x21, x6, #0x1\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x7, x7, #0x1\n"
+ "fmla z27.h, p3/M, z4.h, z9.h\n"
+ "cmp x7, x19\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "fmla z25.h, p3/M, z2.h, z9.h\n"
+ "csel x7, x7, XZR, LT\n"
+ "fmla z24.h, p3/M, z1.h, z9.h\n"
+ "csel x6, x6, x21, LT\n"
+ "fmla z23.h, p3/M, z0.h, z9.h\n"
+ "cmp x6, x20\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 3)
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 4)
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z26.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n" // Load input point (0, 1)
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n" // Load input point (0, 3)
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11]\n" // Load input point (1, 0)
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n" // Load input point (1, 4)
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n" // Load input point (3, 2)
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 4)
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+ "fmla z25.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n" // Load input point (1, 1)
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 3)
+ "fmla z23.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n" // Load input point (1, 3)
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z11.h\n"
+ "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (0, 2)
+ "fmla z24.h, p3/M, z8.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 3)
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10]\n" // Load input point (2, 0)
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 4)
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n" // Load input point (4, 2)
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z8.h, z13.h\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x12]\n" // Store output point (0, 0)
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x12, x13, LSL #1]\n" // Store output point (0, 1)
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmax z27.h, p3/M, z27.h, z18.h\n"
+ "st1h { z29.h }, p0, [x12, x24, LSL #1]\n" // Store output point (0, 2)
+ "fmax z26.h, p3/M, z26.h, z18.h\n"
+ "fmax z25.h, p3/M, z25.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z18.h\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x23]\n" // Store output point (1, 0)
+ "fmin z27.h, p3/M, z27.h, z17.h\n"
+ "fmin z26.h, p3/M, z26.h, z17.h\n"
+ "st1h { z27.h }, p0, [x23, x13, LSL #1]\n" // Store output point (1, 1)
+ "fmin z25.h, p3/M, z25.h, z17.h\n"
+ "fmin z24.h, p3/M, z24.h, z17.h\n"
+ "st1h { z26.h }, p0, [x23, x24, LSL #1]\n" // Store output point (1, 2)
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "st1h { z25.h }, p0, [x22]\n" // Store output point (2, 0)
+ "fmin z23.h, p3/M, z23.h, z17.h\n"
+ "st1h { z24.h }, p0, [x22, x13, LSL #1]\n" // Store output point (2, 1)
+ "st1h { z23.h }, p0, [x22, x24, LSL #1]\n" // Store output point (2, 2)
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..65ecb6d218
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
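+      // The 25 pointers cover the 5x5 input patch needed for a 3x3 output
+      // tile; they are reordered into the order the assembly consumes them,
+      // starting with the patch centre (input_ptrs[12]), which feeds all
+      // nine accumulators.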
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+    }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
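+  // Nine accumulators (z23-z31) produce the 3x3 output tile; the nine output
+  // pointers are fetched from the outptrs table (via x6) inside the loop
+  // rather than being held in registers throughout.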
+ __asm__ __volatile__(
+ "ldr x6, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mov x17, #0x0\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cnth x16\n"
+ "ld1h { z16.h }, p3/Z, [x7]\n" // Load from weights and bias
+ "mov z31.d, z16.d\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n" // Load from weights and bias
+ "sub x15, XZR, x16\n"
+ "mov z30.d, z16.d\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n" // Load from weights and bias
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "mov z29.d, z16.d\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n" // Load from weights and bias
+ "cmp x16, %x[n_channels]\n"
+ "mov z28.d, z16.d\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n" // Load from weights and bias
+ "mov z27.d, z16.d\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n" // Load from weights and bias
+ "mov z26.d, z16.d\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n" // Load from weights and bias
+ "mov z25.d, z16.d\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x7, x7, #16\n"
+ "mov z24.d, z16.d\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n" // Load from weights and bias
+ "mov z23.d, z16.d\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x7, x7, #-6\n"
+ "ldp x14, x13, [x8, #0x0]\n"
+ "ldp x12, x11, [x8, #0x10]\n"
+ "ldr x10, [x8, #0x20]\n"
+ "ld1h { z9.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x9, [x8, #0x28]\n"
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "ldr x28, [x8, #0x30]\n"
+ "inch x15\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x27, [x8, #0x38]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "fmla z27.h, p3/M, z4.h, z9.h\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x8, #0x50]\n"
+ "fmla z25.h, p3/M, z2.h, z9.h\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmla z24.h, p3/M, z1.h, z9.h\n"
+ "ldr x19, [x8, #0x60]\n"
+ "fmla z23.h, p3/M, z0.h, z9.h\n"
+ "ldr x25, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "ldr x24, [x8, #0x70]\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "ldr x23, [x8, #0x78]\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ldr x14, [x8, #0x80]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x13, [x8, #0x88]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "ldr x12, [x8, #0x90]\n"
+ "fmla z26.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ldr x11, [x8, #0x98]\n"
+ "fmla z30.h, p3/M, z6.h, z11.h\n"
+ "ldr x10, [x8, #0xa0]\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "ldr x9, [x8, #0xa8]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x28, [x8, #0xb0]\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "ldr x27, [x8, #0xb8]\n"
+ "fmla z24.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ldr x26, [x8, #0xc0]\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x20, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ldr x22, [x6, #0x0]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "ldr x21, [x6, #0x8]\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "ldr x20, [x6, #0x10]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x19, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ldr x19, [x6, #0x18]\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z16.h }, p3/Z, [x7]\n" // Load from weights and bias
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ldp x14, x13, [x8, #0x0]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z10.h }, p1/Z, [x13, x16, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ldp x12, x11, [x8, #0x10]\n"
+ "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z13.h\n"
+ "ldr x10, [x8, #0x20]\n"
+ "fmla z23.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "inch x17\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n" // Load from weights and bias
+ "whilelt p2.h, x17, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x11, x16, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x12, x16, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x10, x16, LSL #1]\n"
+ "inch x16\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x7, x7, #16\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n" // Load from weights and bias
+ "cmp x16, %x[n_channels]\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x7, x7, #-6\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+ "mov z31.d, z16.d\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "ldr x22, [x6, #0x20]\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+ "mov z30.d, z16.d\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+ "mov z29.d, z16.d\n"
+ "ldr x21, [x6, #0x28]\n"
+ "fmax z27.h, p3/M, z27.h, z18.h\n"
+ "ldr x20, [x6, #0x30]\n"
+ "fmax z26.h, p3/M, z26.h, z18.h\n"
+ "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+ "mov z28.d, z16.d\n"
+ "ldr x19, [x6, #0x38]\n"
+ "fmax z25.h, p3/M, z25.h, z18.h\n"
+ "fmin z27.h, p3/M, z27.h, z17.h\n"
+ "st1h { z27.h }, p0, [x22, x15, LSL #1]\n"
+ "mov z27.d, z16.d\n"
+ "fmin z26.h, p3/M, z26.h, z17.h\n"
+ "ldr x22, [x6, #0x40]\n"
+ "fmin z25.h, p3/M, z25.h, z17.h\n"
+ "st1h { z26.h }, p0, [x21, x15, LSL #1]\n"
+ "mov z26.d, z16.d\n"
+ "fmax z24.h, p3/M, z24.h, z18.h\n"
+ "st1h { z25.h }, p0, [x20, x15, LSL #1]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmin z24.h, p3/M, z24.h, z17.h\n"
+ "st1h { z24.h }, p0, [x19, x15, LSL #1]\n"
+ "mov z24.d, z16.d\n"
+ "fmin z23.h, p3/M, z23.h, z17.h\n"
+ "st1h { z23.h }, p0, [x22, x15, LSL #1]\n"
+ "mov z23.d, z16.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x9, [x8, #0x28]\n"
+ "inch x15\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "ldr x28, [x8, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x27, [x8, #0x38]\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "fmla z27.h, p3/M, z4.h, z9.h\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmla z26.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x8, #0x50]\n"
+ "fmla z25.h, p3/M, z2.h, z9.h\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmla z24.h, p3/M, z1.h, z9.h\n"
+ "ldr x19, [x8, #0x60]\n"
+ "fmla z23.h, p3/M, z0.h, z9.h\n"
+ "ldr x25, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "ldr x24, [x8, #0x70]\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "ldr x23, [x8, #0x78]\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ldr x14, [x8, #0x80]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x13, [x8, #0x88]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "ldr x12, [x8, #0x90]\n"
+ "fmla z26.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "ldr x11, [x8, #0x98]\n"
+ "fmla z30.h, p3/M, z6.h, z11.h\n"
+ "ldr x10, [x8, #0xa0]\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "ldr x9, [x8, #0xa8]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x28, [x8, #0xb0]\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "ldr x27, [x8, #0xb8]\n"
+ "fmla z24.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ldr x26, [x8, #0xc0]\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x20, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ldr x22, [x6, #0x0]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "ldr x21, [x6, #0x8]\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "ldr x20, [x6, #0x10]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x19, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ldr x19, [x6, #0x18]\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z11.h\n"
+ "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z13.h\n"
+ "fmla z24.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "fmla z29.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z8.h, z13.h\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "ldr x22, [x6, #0x20]\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z18.h\n"
+ "fmax z26.h, p3/M, z26.h, z18.h\n"
+ "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "ldr x21, [x6, #0x28]\n"
+ "fmax z25.h, p3/M, z25.h, z18.h\n"
+ "ldr x20, [x6, #0x30]\n"
+ "fmax z24.h, p3/M, z24.h, z18.h\n"
+ "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z17.h\n"
+ "fmin z26.h, p3/M, z26.h, z17.h\n"
+ "ldr x19, [x6, #0x38]\n"
+ "fmin z25.h, p3/M, z25.h, z17.h\n"
+ "st1h { z27.h }, p0, [x22, x15, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z17.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "st1h { z26.h }, p0, [x21, x15, LSL #1]\n"
+ "st1h { z25.h }, p0, [x20, x15, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z17.h\n"
+ "st1h { z24.h }, p0, [x19, x15, LSL #1]\n"
+ "ldr x22, [x6, #0x40]\n"
+ "st1h { z23.h }, p0, [x22, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
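The epilogue at the end of the hunk above fuses the activation into the store path: each accumulator is clamped with fmax against z18 and fmin against z17, the vectors broadcast from the Args min and max fields bound in the operand list, before st1h writes it out. A minimal scalar sketch of the same clamp, assuming nothing beyond those two fields (the function name is illustrative, not part of the kernel):

inline __fp16 clamp_activation(__fp16 acc, __fp16 act_min, __fp16 act_max)
{
  // fmax zN.h, p3/M, zN.h, z_min.h  -- lower clamp
  acc = (acc < act_min) ? act_min : acc;
  // fmin zN.h, p3/M, zN.h, z_max.h  -- upper clamp
  acc = (acc > act_max) ? act_max : acc;
  return acc;
}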
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f976842b7a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
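The struct above is a compile-time descriptor consumed by the depthwise dispatch machinery, and its tile geometry is internally consistent: a 3x3 kernel swept at stride 1 over a 4x4 output tile needs a 6x6 input tile. A small sketch of that relation, assuming only the constants in the struct (the helper name is illustrative):

// input extent needed so the kernel window covers every output point:
//   input = kernel + (output - 1) * stride
constexpr unsigned int required_input_extent(unsigned int kernel,
                                             unsigned int output,
                                             unsigned int stride)
{
  return kernel + (output - 1) * stride;
}

static_assert(required_input_extent(3, 4, 1) == 6,
              "input_rows and input_cols for this 3x3/s1/4x4 kernel");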
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..8f0fce7e96
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x24, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x23, #0x4\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x5, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cnth x6\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x21, XZR, x6\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x15, x8, x22, LSL #1\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x14, x15, x22, LSL #1\n"
+ "ld1h { z13.h }, p3/Z, [x4]\n" // Load from weights and bias
+ "mov z31.d, z13.d\n"
+ "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+ "add x13, x14, x22, LSL #1\n"
+ "mov z30.d, z13.d\n"
+ "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+ "add x12, x13, x22, LSL #1\n"
+ "mov z29.d, z13.d\n"
+ "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+ "add x11, x12, x22, LSL #1\n"
+ "mov z28.d, z13.d\n"
+ "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+ "add x10, x7, x7\n"
+ "mov z27.d, z13.d\n"
+ "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+ "add x9, x10, x7\n"
+ "mov z26.d, z13.d\n"
+ "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+ "add x28, x9, x7\n"
+ "mov z25.d, z13.d\n"
+ "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+ "add x27, x28, x7\n"
+ "mov z24.d, z13.d\n"
+ "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
+ "mov z23.d, z13.d\n"
+ "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col
+ "mov z22.d, z13.d\n"
+ "mul x19, x19, x23\n" // offset *= output_tile_size
+ "mov z21.d, z13.d\n"
+ "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "mov z20.d, z13.d\n"
+ "add x26, x16, x20, LSL #1\n"
+ "mov z19.d, z13.d\n"
+ "add x25, x26, x20, LSL #1\n"
+ "mov z18.d, z13.d\n"
+ "add x24, x25, x20, LSL #1\n"
+ "mov z17.d, z13.d\n"
+ "add x23, x17, x17\n"
+ "mov z16.d, z13.d\n"
+ "add x22, x23, x17\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z9.h }, p2/Z, [x14, x10, LSL #1]\n" // Load input point (2, 2)
+ "ld1h { z10.h }, p2/Z, [x8]\n" // Load input point (0, 0)
+ "addvl x4, x4, #16\n"
+ "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n" // Load input point (0, 5)
+ "cmp x6, %x[n_channels]\n"
+ "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x4, x4, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x14, x9, LSL #1]\n" // Load input point (2, 3)
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z13.h }, p3/Z, [x4]\n" // Load from weights and bias
+ "whilelt p1.h, x6, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "inch x21\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.h, p3/M, z5.h, z9.h\n"
+ "inch x5\n"
+ "fmla z26.h, p3/M, z4.h, z9.h\n"
+ "inch x6\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z22.h, p3/M, z1.h, z9.h\n"
+ "fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n" // Load input point (3, 2)
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11]\n" // Load input point (5, 0)
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (5, 5)
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z26.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "fmla z20.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n" // Load input point (0, 1)
+ "fmla z19.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n" // Load input point (3, 3)
+ "fmla z16.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n" // Load input point (0, 4)
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z7.h, z9.h\n"
+ "fmla z25.h, p3/M, z6.h, z9.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z21.h, p3/M, z3.h, z9.h\n"
+ "fmla z19.h, p3/M, z2.h, z9.h\n"
+ "fmla z18.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x15]\n" // Load input point (1, 0)
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n" // Load input point (1, 5)
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12]\n" // Load input point (4, 0)
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z21.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n" // Load input point (1, 2)
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n" // Load input point (1, 3)
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "fmla z19.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n" // Load input point (4, 5)
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n" // Load input point (2, 1)
+ "fmla z20.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n" // Load input point (5, 1)
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n" // Load input point (2, 4)
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n" // Load input point (5, 4)
+ "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z10.h\n"
+ "fmla z26.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n" // Load input point (0, 2)
+ "fmla z17.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n" // Load input point (3, 1)
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n" // Load input point (0, 3)
+ "addvl x8, x8, #1\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (2, 0)
+ "fmla z27.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "fmla z18.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n" // Load input point (3, 4)
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (2, 5)
+ "addvl x14, x14, #1\n"
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x10, LSL #1]\n" // Load input point (2, 2)
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13]\n" // Load input point (3, 0)
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z16.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n" // Load input point (4, 2)
+ "fmla z28.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n" // Load input point (3, 5)
+ "addvl x13, x13, #1\n"
+ "fmla z27.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z19.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n" // Load input point (5, 2)
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z8.h, z11.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "fmla z18.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n" // Load input point (4, 3)
+ "fmla z24.h, p3/M, z8.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z16.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n" // Load input point (5, 3)
+ "addvl x11, x11, #1\n"
+ "fmla z19.h, p3/M, z8.h, z10.h\n"
+ "fmla z18.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n" // Load input point (1, 1)
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z7.h, z11.h\n"
+ "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z17.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n" // Load input point (1, 4)
+ "addvl x15, x15, #1\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z16.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n" // Load input point (4, 1)
+ "fmla z30.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n" // Load input point (4, 4)
+ "whilelt p2.h, x5, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+ "addvl x12, x12, #1\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "cmp x6, %x[n_channels]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n" // Load input point (0, 5)
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x14, x9, LSL #1]\n" // Load input point (2, 3)
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x8]\n" // Load input point (0, 0)
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+ "addvl x4, x4, #16\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x4, x4, #-6\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "st1h { z31.h }, p0, [x16]\n" // Store output point (0, 0)
+ "mov z31.d, z13.d\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "st1h { z30.h }, p0, [x16, x17, LSL #1]\n" // Store output point (0, 1)
+ "mov z30.d, z13.d\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "st1h { z29.h }, p0, [x16, x23, LSL #1]\n" // Store output point (0, 2)
+ "mov z29.d, z13.d\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "st1h { z28.h }, p0, [x16, x22, LSL #1]\n" // Store output point (0, 3)
+ "mov z28.d, z13.d\n"
+ "addvl x16, x16, #1\n"
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "st1h { z27.h }, p0, [x26]\n" // Store output point (1, 0)
+ "mov z27.d, z13.d\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "st1h { z26.h }, p0, [x26, x17, LSL #1]\n" // Store output point (1, 1)
+ "mov z26.d, z13.d\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "st1h { z25.h }, p0, [x26, x23, LSL #1]\n" // Store output point (1, 2)
+ "mov z25.d, z13.d\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z24.h }, p0, [x26, x22, LSL #1]\n" // Store output point (1, 3)
+ "mov z24.d, z13.d\n"
+ "addvl x26, x26, #1\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z23.h }, p0, [x25]\n" // Store output point (2, 0)
+ "mov z23.d, z13.d\n"
+ "fmin z22.h, p3/M, z22.h, z14.h\n"
+ "st1h { z22.h }, p0, [x25, x17, LSL #1]\n" // Store output point (2, 1)
+ "mov z22.d, z13.d\n"
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
+ "st1h { z21.h }, p0, [x25, x23, LSL #1]\n" // Store output point (2, 2)
+ "mov z21.d, z13.d\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z19.h, p3/M, z19.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ "st1h { z20.h }, p0, [x25, x22, LSL #1]\n" // Store output point (2, 3)
+ "mov z20.d, z13.d\n"
+ "addvl x25, x25, #1\n"
+ "fmin z19.h, p3/M, z19.h, z14.h\n"
+ "st1h { z19.h }, p0, [x24]\n" // Store output point (3, 0)
+ "mov z19.d, z13.d\n"
+ "fmin z18.h, p3/M, z18.h, z14.h\n"
+ "st1h { z18.h }, p0, [x24, x17, LSL #1]\n" // Store output point (3, 1)
+ "mov z18.d, z13.d\n"
+ "fmin z17.h, p3/M, z17.h, z14.h\n"
+ "st1h { z17.h }, p0, [x24, x23, LSL #1]\n" // Store output point (3, 2)
+ "mov z17.d, z13.d\n"
+ "fmax z16.h, p3/M, z16.h, z15.h\n"
+ "fmin z16.h, p3/M, z16.h, z14.h\n"
+ "st1h { z16.h }, p0, [x24, x22, LSL #1]\n" // Store output point (3, 3)
+ "mov z16.d, z13.d\n"
+ "addvl x24, x24, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x21, x2, #0x1\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.h, p3/M, z5.h, z9.h\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z26.h, p3/M, z4.h, z9.h\n"
+ "cmp x3, x19\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z22.h, p3/M, z1.h, z9.h\n"
+ "csel x2, x2, x21, LT\n"
+ "fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n" // Load input point (3, 2)
+ "cmp x2, x20\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11]\n" // Load input point (5, 0)
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (5, 5)
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "fmla z26.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "fmla z20.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n" // Load input point (0, 1)
+ "fmla z19.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n" // Load input point (3, 3)
+ "fmla z16.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n" // Load input point (0, 4)
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z7.h, z9.h\n"
+ "fmla z25.h, p3/M, z6.h, z9.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z21.h, p3/M, z3.h, z9.h\n"
+ "fmla z19.h, p3/M, z2.h, z9.h\n"
+ "fmla z18.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x15]\n" // Load input point (1, 0)
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n" // Load input point (1, 5)
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12]\n" // Load input point (4, 0)
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z21.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n" // Load input point (1, 2)
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n" // Load input point (1, 3)
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "fmla z19.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n" // Load input point (4, 5)
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n" // Load input point (2, 1)
+ "fmla z20.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n" // Load input point (5, 1)
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n" // Load input point (2, 4)
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n" // Load input point (5, 4)
+ "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z10.h\n"
+ "fmla z26.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n" // Load input point (0, 2)
+ "fmla z17.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n" // Load input point (3, 1)
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n" // Load input point (0, 3)
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (2, 0)
+ "fmla z27.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "fmla z18.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n" // Load input point (3, 4)
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (2, 5)
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13]\n" // Load input point (3, 0)
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z16.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n" // Load input point (4, 2)
+ "fmla z28.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n" // Load input point (3, 5)
+ "fmla z27.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z19.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n" // Load input point (5, 2)
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z8.h, z11.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "fmla z18.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n" // Load input point (4, 3)
+ "fmla z24.h, p3/M, z8.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z16.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n" // Load input point (5, 3)
+ "fmla z19.h, p3/M, z8.h, z10.h\n"
+ "fmla z18.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n" // Load input point (1, 1)
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z7.h, z11.h\n"
+ "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z17.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n" // Load input point (1, 4)
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z16.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n" // Load input point (4, 1)
+ "fmla z30.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n" // Load input point (4, 4)
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "st1h { z31.h }, p0, [x16]\n" // Store output point (0, 0)
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "st1h { z30.h }, p0, [x16, x17, LSL #1]\n" // Store output point (0, 1)
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "st1h { z29.h }, p0, [x16, x23, LSL #1]\n" // Store output point (0, 2)
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "st1h { z28.h }, p0, [x16, x22, LSL #1]\n" // Store output point (0, 3)
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "st1h { z27.h }, p0, [x26]\n" // Store output point (1, 0)
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "st1h { z26.h }, p0, [x26, x17, LSL #1]\n" // Store output point (1, 1)
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "st1h { z25.h }, p0, [x26, x23, LSL #1]\n" // Store output point (1, 2)
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z24.h }, p0, [x26, x22, LSL #1]\n" // Store output point (1, 3)
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "fmin z22.h, p3/M, z22.h, z14.h\n"
+ "st1h { z23.h }, p0, [x25]\n" // Store output point (2, 0)
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ "st1h { z22.h }, p0, [x25, x17, LSL #1]\n" // Store output point (2, 1)
+ "fmax z19.h, p3/M, z19.h, z15.h\n"
+ "st1h { z21.h }, p0, [x25, x23, LSL #1]\n" // Store output point (2, 2)
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "st1h { z20.h }, p0, [x25, x22, LSL #1]\n" // Store output point (2, 3)
+ "fmin z19.h, p3/M, z19.h, z14.h\n"
+ "st1h { z19.h }, p0, [x24]\n" // Store output point (3, 0)
+ "fmin z18.h, p3/M, z18.h, z14.h\n"
+ "fmin z17.h, p3/M, z17.h, z14.h\n"
+ "st1h { z18.h }, p0, [x24, x17, LSL #1]\n" // Store output point (3, 1)
+ "fmax z16.h, p3/M, z16.h, z15.h\n"
+ "st1h { z17.h }, p0, [x24, x23, LSL #1]\n" // Store output point (3, 2)
+ "fmin z16.h, p3/M, z16.h, z14.h\n"
+ "st1h { z16.h }, p0, [x24, x22, LSL #1]\n" // Store output point (3, 3)
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
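The tile loop in the function above computes its input base address exactly as the inline comments on the mul/madd/add instructions describe. A C-level sketch of that address arithmetic, assuming only the Args fields the assembly reads (the helper is illustrative, not an interface of the kernel):

#include <cstdint>

// offset  = tile_i * ld_input_row           (mul)
// offset += tile_j * ld_input_col           (madd)
// offset *= kernel_stride * output_size     (mul by 4: stride 1, 4x4 tile)
// inptr  += offset * sizeof(__fp16)         (add ..., LSL #1)
inline const __fp16 *tile_input_base(const __fp16 *inptr,
                                     uint64_t tile_i, uint64_t tile_j,
                                     uint64_t ld_input_row, uint64_t ld_input_col)
{
  uint64_t offset = (tile_i * ld_input_row + tile_j * ld_input_col) * 4;
  return inptr + offset;  // C pointer arithmetic supplies the sizeof scaling
}

The output base address follows the same pattern with ld_output_row/ld_output_col and the 4x4 output tile size.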
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..8148353f1a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,746 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+    }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x2, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x4, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mov x5, #0x0\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cnth x6\n"
+ "ld1h { z13.h }, p3/Z, [x3]\n" // Load from weights and bias
+ "mov z31.d, z13.d\n"
+ "ld1h { z0.h }, p3/Z, [x3, #1, MUL VL]\n" // Load from weights and bias
+ "sub x7, XZR, x6\n"
+ "mov z30.d, z13.d\n"
+ "ld1h { z1.h }, p3/Z, [x3, #2, MUL VL]\n" // Load from weights and bias
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "mov z29.d, z13.d\n"
+ "ld1h { z2.h }, p3/Z, [x3, #3, MUL VL]\n" // Load from weights and bias
+ "cmp x6, %x[n_channels]\n"
+ "mov z28.d, z13.d\n"
+ "ld1h { z3.h }, p3/Z, [x3, #4, MUL VL]\n" // Load from weights and bias
+ "mov z27.d, z13.d\n"
+ "ld1h { z4.h }, p3/Z, [x3, #5, MUL VL]\n" // Load from weights and bias
+ "mov z26.d, z13.d\n"
+ "ld1h { z5.h }, p3/Z, [x3, #6, MUL VL]\n" // Load from weights and bias
+ "mov z25.d, z13.d\n"
+ "ld1h { z6.h }, p3/Z, [x3, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x3, x3, #16\n"
+ "mov z24.d, z13.d\n"
+ "ld1h { z7.h }, p3/Z, [x3, #-8, MUL VL]\n" // Load from weights and bias
+ "mov z23.d, z13.d\n"
+ "ld1h { z8.h }, p3/Z, [x3, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x3, x3, #-6\n"
+ "mov z22.d, z13.d\n"
+ "ldp x8, x17, [x4, #0x0]\n"
+ "mov z21.d, z13.d\n"
+ "ldp x16, x15, [x4, #0x10]\n"
+ "mov z20.d, z13.d\n"
+ "ld1h { z9.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "mov z19.d, z13.d\n"
+ "mov z18.d, z13.d\n"
+ "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "mov z17.d, z13.d\n"
+ "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+ "mov z16.d, z13.d\n"
+ "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x14, [x4, #0x20]\n"
+ "whilelt p1.h, x6, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "ldr x13, [x4, #0x28]\n"
+ "inch x7\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x12, [x4, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.h, p3/M, z5.h, z9.h\n"
+ "ldr x11, [x4, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z9.h\n"
+ "ldr x10, [x4, #0x40]\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [x4, #0x48]\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x28, [x4, #0x50]\n"
+ "fmla z22.h, p3/M, z1.h, z9.h\n"
+ "ldr x27, [x4, #0x58]\n"
+ "fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "ldr x26, [x4, #0x60]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "ldr x25, [x4, #0x68]\n"
+ "fmla z26.h, p3/M, z5.h, z12.h\n"
+ "ldr x24, [x4, #0x70]\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ldr x23, [x4, #0x78]\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "ldr x8, [x4, #0x80]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "ldr x17, [x4, #0x88]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "ldr x16, [x4, #0x90]\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x15, [x4, #0x98]\n"
+ "fmla z20.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z19.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x5, LSL #1]\n"
+ "fmla z16.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "fmla z26.h, p3/M, z7.h, z9.h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "fmla z25.h, p3/M, z6.h, z9.h\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "ldr x11, [x4, #0xb8]\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "ldr x10, [x4, #0xc0]\n"
+ "fmla z21.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [x4, #0xc8]\n"
+ "fmla z19.h, p3/M, z2.h, z9.h\n"
+ "ldr x22, [x2, #0x0]\n"
+ "fmla z18.h, p3/M, z1.h, z9.h\n"
+ "ldr x21, [x2, #0x8]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x28, [x4, #0xd0]\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ldr x27, [x4, #0xd8]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x5, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x26, [x4, #0xe0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x20, [x2, #0x10]\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "ldr x19, [x2, #0x18]\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z13.h }, p3/Z, [x3]\n" // Load from weights and bias
+ "fmla z21.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x4, #0xe8]\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "ldr x23, [x4, #0xf8]\n"
+ "fmla z19.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "ldr x24, [x4, #0xf0]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z11.h\n"
+ "ldr x17, [x4, #0x108]\n"
+ "fmla z16.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ldr x8, [x4, #0x100]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n"
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "ldr x15, [x4, #0x118]\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "ldr x16, [x4, #0x110]\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z10.h\n"
+ "fmla z26.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "fmla z18.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z16.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z19.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z8.h, z11.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "fmla z18.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z16.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+ "fmla z19.h, p3/M, z8.h, z10.h\n"
+ "fmla z18.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z7.h, z11.h\n"
+ "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z17.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "ldp x8, x17, [x4, #0x0]\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z9.h }, p1/Z, [x8, x6, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z16.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x5, LSL #1]\n"
+ "inch x5\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "ldp x16, x15, [x4, #0x10]\n"
+ "whilelt p2.h, x5, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x3, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x3, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x16, x6, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x3, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z6.h }, p3/Z, [x3, #7, MUL VL]\n" // Load from weights and bias
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x15, x6, LSL #1]\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z3.h }, p3/Z, [x3, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z5.h }, p3/Z, [x3, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x17, x6, LSL #1]\n"
+ "inch x6\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "ld1h { z4.h }, p3/Z, [x3, #5, MUL VL]\n" // Load from weights and bias
+ "addvl x3, x3, #16\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "ld1h { z7.h }, p3/Z, [x3, #-8, MUL VL]\n" // Load from weights and bias
+ "cmp x6, %x[n_channels]\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "ld1h { z8.h }, p3/Z, [x3, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x3, x3, #-6\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "st1h { z31.h }, p0, [x22, x7, LSL #1]\n"
+ "mov z31.d, z13.d\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "ldr x22, [x2, #0x20]\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "st1h { z30.h }, p0, [x21, x7, LSL #1]\n"
+ "mov z30.d, z13.d\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "st1h { z29.h }, p0, [x20, x7, LSL #1]\n"
+ "mov z29.d, z13.d\n"
+ "ldr x21, [x2, #0x28]\n"
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "ldr x20, [x2, #0x30]\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "st1h { z28.h }, p0, [x19, x7, LSL #1]\n"
+ "mov z28.d, z13.d\n"
+ "ldr x19, [x2, #0x38]\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "st1h { z27.h }, p0, [x22, x7, LSL #1]\n"
+ "mov z27.d, z13.d\n"
+ "ldr x22, [x2, #0x40]\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "st1h { z26.h }, p0, [x21, x7, LSL #1]\n"
+ "mov z26.d, z13.d\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "ldr x21, [x2, #0x48]\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "st1h { z25.h }, p0, [x20, x7, LSL #1]\n"
+ "mov z25.d, z13.d\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "ldr x20, [x2, #0x50]\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z24.h }, p0, [x19, x7, LSL #1]\n"
+ "mov z24.d, z13.d\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "ldr x19, [x2, #0x58]\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "st1h { z23.h }, p0, [x22, x7, LSL #1]\n"
+ "mov z23.d, z13.d\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "ldr x22, [x2, #0x60]\n"
+ "fmin z22.h, p3/M, z22.h, z14.h\n"
+ "st1h { z22.h }, p0, [x21, x7, LSL #1]\n"
+ "mov z22.d, z13.d\n"
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
+ "ldr x21, [x2, #0x68]\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "st1h { z21.h }, p0, [x20, x7, LSL #1]\n"
+ "mov z21.d, z13.d\n"
+ "fmax z19.h, p3/M, z19.h, z15.h\n"
+ "ldr x20, [x2, #0x70]\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ "st1h { z20.h }, p0, [x19, x7, LSL #1]\n"
+ "mov z20.d, z13.d\n"
+ "fmin z19.h, p3/M, z19.h, z14.h\n"
+ "ldr x19, [x2, #0x78]\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "st1h { z19.h }, p0, [x22, x7, LSL #1]\n"
+ "mov z19.d, z13.d\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmin z18.h, p3/M, z18.h, z14.h\n"
+ "st1h { z18.h }, p0, [x21, x7, LSL #1]\n"
+ "mov z18.d, z13.d\n"
+ "fmin z17.h, p3/M, z17.h, z14.h\n"
+ "st1h { z17.h }, p0, [x20, x7, LSL #1]\n"
+ "mov z17.d, z13.d\n"
+ "fmax z16.h, p3/M, z16.h, z15.h\n"
+ "fmin z16.h, p3/M, z16.h, z14.h\n"
+ "st1h { z16.h }, p0, [x19, x7, LSL #1]\n"
+ "mov z16.d, z13.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x14, [x4, #0x20]\n"
+ "inch x7\n"
+ "fmla z30.h, p3/M, z7.h, z9.h\n"
+ "ldr x13, [x4, #0x28]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x12, [x4, #0x30]\n"
+ "fmla z27.h, p3/M, z5.h, z9.h\n"
+ "ldr x11, [x4, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z9.h\n"
+ "ldr x10, [x4, #0x40]\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [x4, #0x48]\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ldr x28, [x4, #0x50]\n"
+ "fmla z22.h, p3/M, z1.h, z9.h\n"
+ "ldr x27, [x4, #0x58]\n"
+ "fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "ldr x26, [x4, #0x60]\n"
+ "fmla z29.h, p3/M, z7.h, z12.h\n"
+ "ldr x25, [x4, #0x68]\n"
+ "fmla z26.h, p3/M, z5.h, z12.h\n"
+ "ldr x24, [x4, #0x70]\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ldr x23, [x4, #0x78]\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "ldr x8, [x4, #0x80]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "ldr x17, [x4, #0x88]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "ldr x16, [x4, #0x90]\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x15, [x4, #0x98]\n"
+ "fmla z20.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z19.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x5, LSL #1]\n"
+ "fmla z16.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "ldr x14, [x4, #0xa0]\n"
+ "fmla z26.h, p3/M, z7.h, z9.h\n"
+ "ldr x13, [x4, #0xa8]\n"
+ "fmla z25.h, p3/M, z6.h, z9.h\n"
+ "ldr x12, [x4, #0xb0]\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "ldr x11, [x4, #0xb8]\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "ldr x10, [x4, #0xc0]\n"
+ "fmla z21.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [x4, #0xc8]\n"
+ "fmla z19.h, p3/M, z2.h, z9.h\n"
+ "ldr x22, [x2, #0x0]\n"
+ "fmla z18.h, p3/M, z1.h, z9.h\n"
+ "ldr x21, [x2, #0x8]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x28, [x4, #0xd0]\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ldr x27, [x4, #0xd8]\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x5, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x26, [x4, #0xe0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x20, [x2, #0x10]\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "ldr x19, [x2, #0x18]\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z21.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z17.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "ldr x25, [x4, #0xe8]\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "ldr x23, [x4, #0xf8]\n"
+ "fmla z19.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "ldr x24, [x4, #0xf0]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z11.h\n"
+ "ldr x17, [x4, #0x108]\n"
+ "fmla z16.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ldr x8, [x4, #0x100]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n"
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "ldr x15, [x4, #0x118]\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "ldr x16, [x4, #0x110]\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z10.h\n"
+ "fmla z26.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "fmla z19.h, p3/M, z1.h, z11.h\n"
+ "fmla z18.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z16.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z19.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z8.h, z11.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "fmla z18.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z16.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+ "fmla z19.h, p3/M, z8.h, z10.h\n"
+ "fmla z18.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z7.h, z11.h\n"
+ "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z17.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x17, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z16.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x5, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x5, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z21.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "st1h { z31.h }, p0, [x22, x7, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "ldr x22, [x2, #0x20]\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "st1h { z30.h }, p0, [x21, x7, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "st1h { z29.h }, p0, [x20, x7, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "st1h { z28.h }, p0, [x19, x7, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "ldr x21, [x2, #0x28]\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "ldr x20, [x2, #0x30]\n"
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "ldr x19, [x2, #0x38]\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "st1h { z27.h }, p0, [x22, x7, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z26.h }, p0, [x21, x7, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "ldr x22, [x2, #0x40]\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "ldr x21, [x2, #0x48]\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "st1h { z25.h }, p0, [x20, x7, LSL #1]\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "st1h { z24.h }, p0, [x19, x7, LSL #1]\n"
+ "fmax z19.h, p3/M, z19.h, z15.h\n"
+ "st1h { z23.h }, p0, [x22, x7, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z14.h\n"
+ "ldr x20, [x2, #0x50]\n"
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
+ "ldr x19, [x2, #0x58]\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ "ldr x22, [x2, #0x60]\n"
+ "fmin z19.h, p3/M, z19.h, z14.h\n"
+ "st1h { z22.h }, p0, [x21, x7, LSL #1]\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "st1h { z21.h }, p0, [x20, x7, LSL #1]\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "st1h { z20.h }, p0, [x19, x7, LSL #1]\n"
+ "fmax z16.h, p3/M, z16.h, z15.h\n"
+ "st1h { z19.h }, p0, [x22, x7, LSL #1]\n"
+ "ldr x21, [x2, #0x68]\n"
+ "fmin z18.h, p3/M, z18.h, z14.h\n"
+ "ldr x20, [x2, #0x70]\n"
+ "fmin z17.h, p3/M, z17.h, z14.h\n"
+ "ldr x19, [x2, #0x78]\n"
+ "fmin z16.h, p3/M, z16.h, z14.h\n"
+ "st1h { z18.h }, p0, [x21, x7, LSL #1]\n"
+ "st1h { z17.h }, p0, [x20, x7, LSL #1]\n"
+ "st1h { z16.h }, p0, [x19, x7, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
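
Editor's note: before each st1h store in the kernel above, every accumulator is clamped by an fmax against the broadcast activation minimum (held in z15) and an fmin against the maximum (z14). A minimal scalar sketch of that clamp, assuming an __fp16-capable toolchain (__ARM_FP16_ARGS); the function name is illustrative, not an identifier from the kernel:

    #include <algorithm>

    // Scalar equivalent of the fmax/fmin pair applied to each accumulator:
    // acc = min(max(acc, act_min), act_max), i.e. a bounded-ReLU clamp.
    static inline __fp16 clamp_activation(__fp16 acc, __fp16 act_min, __fp16 act_max)
    {
        acc = std::max(acc, act_min);   // fmax z_acc, p3/M, z_acc, z15 (min)
        return std::min(acc, act_max);  // fmin z_acc, p3/M, z_acc, z14 (max)
    }
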
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..98f50f8436
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
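
Editor's note: the tile geometry declared in this header follows the usual relation between output tile, stride and kernel size: a K-tap filter applied at stride S to produce O outputs reads (O - 1) * S + K inputs per dimension, hence the 5x5 input patch above. A small compile-time check, with an illustrative helper name:

    constexpr unsigned int required_inputs(unsigned int outputs, unsigned int stride, unsigned int kernel)
    {
        return (outputs - 1) * stride + kernel;  // last window start + window extent
    }
    static_assert(required_inputs(2, 2, 3) == 5,
                  "a 2x2 output tile at stride 2 under a 3x3 filter reads a 5x5 input patch");
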
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..e620604a16
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x4\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x17, #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x15, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cnth x14\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x12, XZR, x14\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x8, x13, x19\n" // offset += tile_j * ld_input_col
+ "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
+ "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x21, x21, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x9, x21, x22, LSL #1\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x28, x9, x22, LSL #1\n"
+ "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+ "mov z31.d, z17.d\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+ "add x27, x28, x22, LSL #1\n"
+ "mov z30.d, z17.d\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+ "add x26, x27, x22, LSL #1\n"
+ "mov z29.d, z17.d\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+ "add x25, x13, x13\n"
+ "mov z28.d, z17.d\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+ "add x24, x25, x13\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+ "add x23, x24, x13\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+ "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+ "madd x19, x8, x11, x19\n" // offset += tile_j * ld_output_col
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z9.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (2, 2)
+ "ld1h { z10.h }, p2/Z, [x21]\n" // Load input point (0, 0)
+ "mul x19, x19, x17\n" // offset *= output_tile_size
+ "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n" // Load input point (0, 1)
+ "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z12.h }, p2/Z, [x21, x24, LSL #1]\n" // Load input point (0, 3)
+ "add x22, x10, x20, LSL #1\n"
+ "ld1h { z13.h }, p2/Z, [x21, x23, LSL #1]\n" // Load input point (0, 4)
+ "addvl x16, x16, #16\n"
+ "ld1h { z14.h }, p2/Z, [x9]\n" // Load input point (1, 0)
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x16, x16, #-6\n"
+ "ld1h { z15.h }, p2/Z, [x9, x13, LSL #1]\n" // Load input point (1, 1)
+ "ld1h { z16.h }, p2/Z, [x21, x25, LSL #1]\n" // Load input point (0, 2)
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "inch x12\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "inch x15\n"
+ "addvl x21, x21, #1\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x21]\n" // Load input point (0, 0)
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (1, 4)
+ "inch x14\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (1, 3)
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (1, 2)
+ "addvl x9, x9, #1\n"
+ "fmla z31.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x27]\n" // Load input point (3, 0)
+ "fmla z30.h, p3/M, z0.h, z16.h\n"
+ "fmla z29.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (3, 4)
+ "fmla z31.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x28]\n" // Load input point (2, 0)
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n" // Load input point (3, 1)
+ "fmla z29.h, p3/M, z0.h, z15.h\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n" // Load input point (2, 1)
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (2, 3)
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (2, 4)
+ "addvl x28, x28, #1\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x28, x25, LSL #1]\n" // Load input point (2, 2)
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (3, 3)
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x26]\n" // Load input point (4, 0)
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n" // Load input point (4, 1)
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n" // Load input point (4, 2)
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (3, 2)
+ "addvl x27, x27, #1\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x24, LSL #1]\n" // Load input point (0, 3)
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x21, x23, LSL #1]\n" // Load input point (0, 4)
+ "fmax z31.h, p3/M, z31.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (4, 3)
+ "fmax z30.h, p3/M, z30.h, z19.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+ "fmin z31.h, p3/M, z31.h, z18.h\n"
+ "st1h { z31.h }, p0, [x10]\n" // Store output point (0, 0)
+ "mov z31.d, z17.d\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (4, 4)
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z8.h, z15.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+ "addvl x26, x26, #1\n"
+ "fmin z30.h, p3/M, z30.h, z18.h\n"
+ "st1h { z30.h }, p0, [x10, x11, LSL #1]\n" // Store output point (0, 1)
+ "mov z30.d, z17.d\n"
+ "addvl x10, x10, #1\n"
+ "fmla z28.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x21, x25, LSL #1]\n" // Load input point (0, 2)
+ "cmp x14, %x[n_channels]\n"
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z14.h }, p1/Z, [x9]\n" // Load input point (1, 0)
+ "fmin z29.h, p3/M, z29.h, z18.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "mov z29.d, z17.d\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p1/Z, [x9, x13, LSL #1]\n" // Load input point (1, 1)
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n" // Load input point (0, 1)
+ "addvl x16, x16, #16\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+ "fmin z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z28.h }, p0, [x22, x11, LSL #1]\n" // Store output point (1, 1)
+ "mov z28.d, z17.d\n"
+ "addvl x22, x22, #1\n"
+ "addvl x16, x16, #-6\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x21, x7, #0x1\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "cmp x8, x19\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (1, 4)
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (1, 3)
+ "csel x8, x8, XZR, LT\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (1, 2)
+ "csel x7, x7, x21, LT\n"
+ "fmla z31.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x27]\n" // Load input point (3, 0)
+ "cmp x7, x20\n"
+ "fmla z30.h, p3/M, z0.h, z16.h\n"
+ "fmla z29.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (3, 4)
+ "fmla z31.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x28]\n" // Load input point (2, 0)
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n" // Load input point (3, 1)
+ "fmla z29.h, p3/M, z0.h, z15.h\n"
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n" // Load input point (2, 1)
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (2, 3)
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (2, 4)
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (3, 3)
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x26]\n" // Load input point (4, 0)
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n" // Load input point (4, 1)
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n" // Load input point (4, 2)
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (3, 2)
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "fmax z31.h, p3/M, z31.h, z19.h\n"
+ "fmla z28.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (4, 3)
+ "fmax z30.h, p3/M, z30.h, z19.h\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "fmin z31.h, p3/M, z31.h, z18.h\n"
+ "st1h { z31.h }, p0, [x10]\n" // Store output point (0, 0)
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z15.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (4, 4)
+ "fmin z30.h, p3/M, z30.h, z18.h\n"
+ "st1h { z30.h }, p0, [x10, x11, LSL #1]\n" // Store output point (0, 1)
+ "fmla z28.h, p3/M, z3.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "fmla z28.h, p3/M, z7.h, z14.h\n"
+ "fmin z29.h, p3/M, z29.h, z18.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z28.h }, p0, [x22, x11, LSL #1]\n" // Store output point (1, 1)
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
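
Editor's note: the commented mul/madd sequence at the top of the tile loop above computes where each tile's input window starts. The same arithmetic in plain C++, assuming the register comments describe the intent faithfully; all names here are illustrative, since the kernel keeps these values in x-registers:

    #include <cstdint>

    static inline const __fp16 *tile_input_start(
        const __fp16 *inptr, uint64_t tile_i, uint64_t tile_j,
        uint64_t ld_input_row, uint64_t ld_input_col)
    {
        uint64_t offset = tile_i * ld_input_row;  // mul : offset = tile_i * ld_input_row
        offset = tile_j * ld_input_col + offset;  // madd: offset += tile_j * ld_input_col
        offset *= 2 * 2;                          // mul : stride (2) * output tile (2) = 4
        return inptr + offset;                    // add ..., LSL #1 scales by sizeof(__fp16)
    }
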
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..3ed743e3ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mov x14, #0x0\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cnth x13\n"
+ "ldp x12, x11, [x19, #0x0]\n"
+ "sub x10, XZR, x13\n"
+ "ldp x9, x28, [x19, #0x10]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+ "mov z31.d, z17.d\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+ "cmp x13, %x[n_channels]\n"
+ "mov z30.d, z17.d\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+ "mov z29.d, z17.d\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+ "mov z28.d, z17.d\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x16, x16, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldp x25, x23, [x15, #0x10]\n"
+ "ldp x22, x21, [x15, #0x20]\n"
+ "ldp x20, x19, [x15, #0x30]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z15.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [x15, #0x48]\n"
+ "inch x10\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ldr x19, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x60]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "ldr x19, [x15, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z16.h\n"
+ "ldr x27, [x15, #0x80]\n"
+ "fmla z29.h, p3/M, z3.h, z14.h\n"
+ "ldr x26, [x15, #0x88]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "fmla z31.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z15.h\n"
+ "ld1h { z14.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x23, [x15, #0x98]\n"
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "ldr x19, [x15, #0xb8]\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z15.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ldr x24, [x15, #0xc0]\n"
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "fmla z29.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "ldp x25, x23, [x15, #0x10]\n"
+ "ldp x22, x21, [x15, #0x20]\n"
+ "fmla z28.h, p3/M, z5.h, z14.h\n"
+ "fmax z31.h, p3/M, z31.h, z19.h\n"
+ "ld1h { z14.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "fmax z30.h, p3/M, z30.h, z19.h\n"
+ "ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x23, x13, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "fmin z31.h, p3/M, z31.h, z18.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "inch x14\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n"
+ "whilelt p2.h, x14, %x[n_channels]\n"
+ "fmin z30.h, p3/M, z30.h, z18.h\n"
+ "ldp x20, x19, [x15, #0x30]\n"
+ "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z3.h, z16.h\n"
+ "st1h { z31.h }, p0, [x12, x10, LSL #1]\n"
+ "mov z31.d, z17.d\n"
+ "ld1h { z16.h }, p1/Z, [x19, x13, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z15.h\n"
+ "st1h { z30.h }, p0, [x11, x10, LSL #1]\n"
+ "mov z30.d, z17.d\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z7.h, z14.h\n"
+ "ld1h { z14.h }, p1/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p1/Z, [x20, x13, LSL #1]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+ "fmin z29.h, p3/M, z29.h, z18.h\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "inch x13\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z29.h }, p0, [x9, x10, LSL #1]\n"
+ "cmp x13, %x[n_channels]\n"
+ "mov z29.d, z17.d\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x16, x16, #16\n"
+ "fmin z28.h, p3/M, z28.h, z18.h\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x16, x16, #-6\n"
+ "st1h { z28.h }, p0, [x28, x10, LSL #1]\n"
+ "mov z28.d, z17.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x24, [x15, #0x40]\n"
+ "inch x10\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [x15, #0x48]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "ldr x19, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x60]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "ldr x19, [x15, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z16.h\n"
+ "ldr x27, [x15, #0x80]\n"
+ "fmla z29.h, p3/M, z3.h, z14.h\n"
+ "ldr x26, [x15, #0x88]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "fmla z31.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z0.h, z15.h\n"
+ "ld1h { z14.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x23, [x15, #0x98]\n"
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "ldr x21, [x15, #0xa8]\n"
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "ldr x19, [x15, #0xb8]\n"
+ "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z15.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ldr x24, [x15, #0xc0]\n"
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z15.h }, p2/Z, [x19, x14, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmax z31.h, p3/M, z31.h, z19.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "fmax z30.h, p3/M, z30.h, z19.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z18.h\n"
+ "st1h { z31.h }, p0, [x12, x10, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z18.h\n"
+ "st1h { z30.h }, p0, [x11, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z14.h\n"
+ "fmla z29.h, p3/M, z8.h, z15.h\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmin z29.h, p3/M, z29.h, z18.h\n"
+ "st1h { z29.h }, p0, [x9, x10, LSL #1]\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z28.h }, p0, [x28, x10, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
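
Editor's note: for reference, the computation the fmla chains above implement for a single output point, one channel at a time. The real kernel evaluates all four points of the 2x2 tile in parallel across one SVE vector of channels, each point reading a different 3x3 window of the 5x5 patch addressed through inptrs; this scalar sketch uses illustrative names throughout:

    #include <algorithm>

    static __fp16 depthwise_output_point(
        const __fp16 *const window[9],  // row-major 3x3 window into the input patch
        const __fp16 weights[9],        // the 9 filter taps for this channel
        __fp16 bias, __fp16 act_min, __fp16 act_max,
        unsigned int channel)
    {
        __fp16 acc = bias;                                 // mov z_acc.d, z_bias.d
        for (unsigned int i = 0; i < 9; i++)
            acc += weights[i] * window[i][channel];        // fmla z_acc, p3/M, z_w, z_in
        return std::min(std::max(acc, act_min), act_max);  // fmax / fmin clamp
    }
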
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..20f3ee0329
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
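
Editor's note: as with the other strategy headers in this patch, the struct above is consumed through its stored function pointers, and the constexpr members let the framework size buffers at compile time. A sketch of that usage, with call arguments elided since they depend on the caller:

    using Strategy = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst;

    // (output - 1) * stride + kernel: a 2x2 tile at stride 1 under a 5x5
    // filter needs the 6x6 input patch declared above.
    static_assert(Strategy::input_rows ==
                  (Strategy::output_rows - 1) * Strategy::stride_rows + Strategy::kernel_rows,
                  "input tile height must cover the receptive field");

    // Dispatch (argument values depend on the caller and are omitted here):
    //   Strategy strat(&cpu_info);
    //   strat.indirect_kernel(inptrs, outptrs, params, n_channels, act_min, act_max);
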
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..f1ee5c53ce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x5, #0x0\n"
+ "mov x6, #0x0\n"
+ "1:" // Tile loop
+ "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x20, #0x2\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x7, #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x17, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cnth x16\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x14, XZR, x16\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x13, x13, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x20, x13, x22, LSL #1\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x10, x20, x22, LSL #1\n"
+ "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "mov z31.d, z16.d\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "add x9, x10, x22, LSL #1\n"
+ "mov z30.d, z16.d\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "add x28, x9, x22, LSL #1\n"
+ "mov z29.d, z16.d\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "add x27, x28, x22, LSL #1\n"
+ "mov z28.d, z16.d\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+ "add x26, x15, x15\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+ "add x25, x26, x15\n"
+ "mul x19, x5, x21\n" // offset = tile_i * ld_output_row
+ "add x24, x25, x15\n"
+ "add x23, x24, x15\n"
+ "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x7\n" // offset *= output_tile_size
+ "add x11, x11, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x22, x11, x21, LSL #1\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z5.h }, p2/Z, [x13]\n" // Load input point (0, 0)
+ "ld1h { z6.h }, p2/Z, [x13, x15, LSL #1]\n" // Load input point (0, 1)
+ "cmp x16, %x[n_channels]\n"
+ "ld1h { z7.h }, p2/Z, [x20]\n" // Load input point (1, 0)
+ "addvl x8, x8, #6\n"
+ "ld1h { z8.h }, p2/Z, [x20, x15, LSL #1]\n" // Load input point (1, 1)
+ "ld1h { z9.h }, p2/Z, [x13, x26, LSL #1]\n" // Load input point (0, 2)
+ "ld1h { z13.h }, p2/Z, [x20, x26, LSL #1]\n" // Load input point (1, 2)
+ "ld1h { z11.h }, p2/Z, [x13, x25, LSL #1]\n" // Load input point (0, 3)
+ "ld1h { z12.h }, p2/Z, [x13, x24, LSL #1]\n" // Load input point (0, 4)
+ "ld1h { z10.h }, p2/Z, [x20, x23, LSL #1]\n" // Load input point (1, 5)
+ "ld1h { z14.h }, p2/Z, [x10]\n" // Load input point (2, 0)
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n" // Load input point (1, 3)
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "inch x14\n"
+ "fmla z29.h, p3/M, z0.h, z7.h\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z0.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "inch x17\n"
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n" // Load input point (1, 4)
+ "addvl x20, x20, #1\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "inch x16\n"
+ "fmla z29.h, p3/M, z1.h, z8.h\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n" // Load input point (0, 5)
+ "addvl x13, x13, #1\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z5.h\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 2)
+ "fmla z30.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 3)
+ "fmla z29.h, p3/M, z4.h, z6.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z7.h\n"
+ "ld1h { z7.h }, p1/Z, [x20]\n" // Load input point (1, 0)
+ "fmla z30.h, p3/M, z0.h, z8.h\n"
+ "fmla z29.h, p3/M, z0.h, z14.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (2, 5)
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (2, 4)
+ "addvl x10, x10, #1\n"
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x8, x8, #16\n"
+ "fmla z31.h, p3/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+ "ld1h { z16.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 2)
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (3, 5)
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z5.h\n"
+ "fmla z28.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 3)
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z6.h\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (3, 4)
+ "addvl x9, x9, #1\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x28]\n" // Load input point (4, 0)
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (4, 4)
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 2)
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "fmla z29.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 3)
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (4, 5)
+ "addvl x28, x28, #1\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z28.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (5, 0)
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z28.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n" // Load input point (5, 1)
+ "fmla z30.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z14.h }, p1/Z, [x10]\n" // Load input point (2, 0)
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n" // Load input point (5, 2)
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (5, 3)
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p1/Z, [x20, x26, LSL #1]\n" // Load input point (1, 2)
+ "fmla z30.h, p3/M, z1.h, z5.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (5, 4)
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z5.h }, p1/Z, [x13]\n" // Load input point (0, 0)
+ "fmla z30.h, p3/M, z2.h, z6.h\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (5, 5)
+ "whilelt p2.h, x17, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x27, x27, #1\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z6.h }, p1/Z, [x13, x15, LSL #1]\n" // Load input point (0, 1)
+ "addvl x8, x8, #16\n"
+ "fmla z30.h, p3/M, z3.h, z8.h\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x13, x25, LSL #1]\n" // Load input point (0, 3)
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x20, x15, LSL #1]\n" // Load input point (1, 1)
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x20, x23, LSL #1]\n" // Load input point (1, 5)
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x13, x24, LSL #1]\n" // Load input point (0, 4)
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x13, x26, LSL #1]\n" // Load input point (0, 2)
+ "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "addvl x8, x8, #-6\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x11]\n" // Store output point (0, 0)
+ "mov z31.d, z16.d\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "st1h { z30.h }, p0, [x11, x12, LSL #1]\n" // Store output point (0, 1)
+ "mov z30.d, z16.d\n"
+ "addvl x11, x11, #1\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "mov z29.d, z16.d\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z28.h }, p0, [x22, x12, LSL #1]\n" // Store output point (1, 1)
+ "mov z28.d, z16.d\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n" // Load input point (1, 3)
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x5, #0x1\n"
+ "fmla z29.h, p3/M, z0.h, z7.h\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "fmla z28.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z0.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "add x6, x6, #0x1\n"
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n" // Load input point (1, 4)
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z29.h, p3/M, z1.h, z8.h\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x6, x19\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n" // Load input point (0, 5)
+ "csel x6, x6, XZR, LT\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "csel x5, x5, x21, LT\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "cmp x5, x20\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z5.h\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 2)
+ "fmla z30.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 3)
+ "fmla z29.h, p3/M, z4.h, z6.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z7.h\n"
+ "fmla z30.h, p3/M, z0.h, z8.h\n"
+ "fmla z29.h, p3/M, z0.h, z14.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (2, 5)
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (2, 4)
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x8, x8, #16\n"
+ "fmla z31.h, p3/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 2)
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (3, 5)
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z5.h\n"
+ "fmla z28.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 3)
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z6.h\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (3, 4)
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x28]\n" // Load input point (4, 0)
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (4, 4)
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 2)
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "fmla z29.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 3)
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x8]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (4, 5)
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z28.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (5, 0)
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z28.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n" // Load input point (5, 1)
+ "fmla z30.h, p3/M, z4.h, z14.h\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n" // Load input point (5, 2)
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (5, 3)
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z1.h, z5.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (5, 4)
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "fmla z30.h, p3/M, z2.h, z6.h\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (5, 5)
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "fmla z30.h, p3/M, z3.h, z8.h\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x11]\n" // Store output point (0, 0)
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x11, x12, LSL #1]\n" // Store output point (0, 1)
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "st1h { z28.h }, p0, [x22, x12, LSL #1]\n" // Store output point (1, 1)
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..caa15a9816
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+ }
+ };
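+
+  // The constructor above permutes the 36 raw input pointers into the order
+  // in which the assembly consumes them, so the kernel can walk the pointer
+  // array at fixed, increasing offsets (0x0 through 0x118) instead of
+  // recomputing patch coordinates per load.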
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x6, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mov x7, #0x0\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cnth x8\n"
+ "ldp x17, x16, [x19, #0x0]\n"
+ "sub x15, XZR, x8\n"
+ "ldp x14, x13, [x19, #0x10]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z16.h }, p3/Z, [x5]\n" // Load from weights and bias
+ "mov z31.d, z16.d\n"
+ "ld1h { z0.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+ "cmp x8, %x[n_channels]\n"
+ "mov z30.d, z16.d\n"
+ "ld1h { z1.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+ "mov z29.d, z16.d\n"
+ "ld1h { z2.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+ "mov z28.d, z16.d\n"
+ "ld1h { z3.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+ "ld1h { z4.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+ "addvl x5, x5, #6\n"
+ "ldp x12, x11, [x6, #0x0]\n"
+ "ldp x10, x9, [x6, #0x10]\n"
+ "ldp x20, x28, [x6, #0x20]\n"
+ "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "ld1h { z6.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "ld1h { z7.h }, p2/Z, [x10, x7, LSL #1]\n"
+ "ld1h { z8.h }, p2/Z, [x9, x7, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "ldp x27, x19, [x6, #0x30]\n"
+ "ldp x26, x25, [x6, #0x40]\n"
+ "ld1h { z11.h }, p2/Z, [x27, x7, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x7, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x25, x7, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ldr x24, [x6, #0x50]\n"
+ "whilelt p1.h, x8, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "ldr x23, [x6, #0x58]\n"
+ "inch x15\n"
+ "fmla z29.h, p3/M, z0.h, z7.h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x5]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x21, [x6, #0x68]\n"
+ "fmla z29.h, p3/M, z1.h, z8.h\n"
+ "ldr x20, [x6, #0x70]\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ldr x19, [x6, #0x78]\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x6, #0x80]\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ldr x11, [x6, #0x88]\n"
+ "fmla z29.h, p3/M, z3.h, z5.h\n"
+ "ldr x10, [x6, #0x90]\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z6.h\n"
+ "ldr x9, [x6, #0x98]\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z7.h\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ldr x28, [x6, #0xa8]\n"
+ "fmla z29.h, p3/M, z0.h, z14.h\n"
+ "ldr x27, [x6, #0xb0]\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ldr x19, [x6, #0xb8]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "ldr x26, [x6, #0xc0]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "ldr x25, [x6, #0xc8]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x6, #0xd0]\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z5.h\n"
+ "addvl x5, x5, #16\n"
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z5.h }, p2/Z, [x10, x7, LSL #1]\n"
+ "ldr x23, [x6, #0xd8]\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x6, #0xe0]\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "ldr x21, [x6, #0xe8]\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "fmla z29.h, p3/M, z0.h, z5.h\n"
+ "ldr x19, [x6, #0xf8]\n"
+ "fmla z28.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #-6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ldr x12, [x6, #0x100]\n"
+ "fmla z29.h, p3/M, z1.h, z6.h\n"
+ "ldr x11, [x6, #0x108]\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #-5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ldr x10, [x6, #0x110]\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "ldr x9, [x6, #0x118]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #-4, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z16.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #-3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x22, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #-2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "fmla z29.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #-1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x5]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z28.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z28.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z14.h\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "ldp x12, x11, [x6, #0x0]\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z1.h, z5.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z5.h }, p1/Z, [x12, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z6.h\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x9, x7, LSL #1]\n"
+ "inch x7\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ldp x10, x9, [x6, #0x10]\n"
+ "whilelt p2.h, x7, %x[n_channels]\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z6.h }, p1/Z, [x11, x8, LSL #1]\n"
+ "ldp x20, x28, [x6, #0x20]\n"
+ "fmla z30.h, p3/M, z3.h, z8.h\n"
+ "ldp x27, x19, [x6, #0x30]\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z7.h }, p1/Z, [x10, x8, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z13.h }, p1/Z, [x28, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x9, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z11.h }, p1/Z, [x27, x8, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x19, x8, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x20, x8, LSL #1]\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "ldp x26, x25, [x6, #0x40]\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "addvl x5, x5, #16\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "ld1h { z10.h }, p1/Z, [x26, x8, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x25, x8, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "inch x8\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias
+ "cmp x8, %x[n_channels]\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias
+ "addvl x5, x5, #-6\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z31.h }, p0, [x17, x15, LSL #1]\n"
+ "mov z31.d, z16.d\n"
+ "st1h { z30.h }, p0, [x16, x15, LSL #1]\n"
+ "mov z30.d, z16.d\n"
+ "st1h { z29.h }, p0, [x14, x15, LSL #1]\n"
+ "mov z29.d, z16.d\n"
+ "st1h { z28.h }, p0, [x13, x15, LSL #1]\n"
+ "mov z28.d, z16.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ldr x24, [x6, #0x50]\n"
+ "inch x15\n"
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "ldr x23, [x6, #0x58]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.h, p3/M, z0.h, z7.h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "fmla z28.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x5]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x21, [x6, #0x68]\n"
+ "fmla z29.h, p3/M, z1.h, z8.h\n"
+ "fmla z28.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+ "ldr x20, [x6, #0x70]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ldr x19, [x6, #0x78]\n"
+ "fmla z29.h, p3/M, z2.h, z13.h\n"
+ "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+ "ldr x12, [x6, #0x80]\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ldr x11, [x6, #0x88]\n"
+ "fmla z29.h, p3/M, z3.h, z5.h\n"
+ "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+ "ldr x10, [x6, #0x90]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z6.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+ "ldr x9, [x6, #0x98]\n"
+ "fmla z31.h, p3/M, z0.h, z7.h\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ldr x28, [x6, #0xa8]\n"
+ "fmla z29.h, p3/M, z0.h, z14.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+ "ldr x27, [x6, #0xb0]\n"
+ "fmla z31.h, p3/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ldr x19, [x6, #0xb8]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias
+ "ldr x26, [x6, #0xc0]\n"
+ "fmla z31.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z5.h\n"
+ "ldr x25, [x6, #0xc8]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias
+ "addvl x5, x5, #16\n"
+ "fmla z31.h, p3/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x10, x7, LSL #1]\n"
+ "ldr x24, [x6, #0xd0]\n"
+ "fmla z30.h, p3/M, z3.h, z6.h\n"
+ "ldr x23, [x6, #0xd8]\n"
+ "fmla z29.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias
+ "ldr x22, [x6, #0xe0]\n"
+ "fmla z31.h, p3/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "fmla z28.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias
+ "ldr x21, [x6, #0xe8]\n"
+ "fmla z31.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "fmla z29.h, p3/M, z0.h, z5.h\n"
+ "fmla z28.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #-6, MUL VL]\n" // Load from weights and bias
+ "ldr x19, [x6, #0xf8]\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "ldr x12, [x6, #0x100]\n"
+ "fmla z29.h, p3/M, z1.h, z6.h\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p3/Z, [x5, #-5, MUL VL]\n" // Load from weights and bias
+ "ldr x11, [x6, #0x108]\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ldr x10, [x6, #0x110]\n"
+ "fmla z29.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #-4, MUL VL]\n" // Load from weights and bias
+ "ldr x9, [x6, #0x118]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z13.h\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #-3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x22, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #-2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z6.h\n"
+ "fmla z29.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p3/Z, [x5, #-1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x5]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z28.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z28.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z14.h\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z1.h, z5.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "fmla z30.h, p3/M, z2.h, z6.h\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x9, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "fmla z30.h, p3/M, z3.h, z8.h\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x17, x15, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x16, x15, LSL #1]\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z29.h }, p0, [x14, x15, LSL #1]\n"
+ "st1h { z28.h }, p0, [x13, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..74716ddf1f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
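+  // For stride 1, the input patch is (output + kernel - 1) elements on a
+  // side: 2 + 3 - 1 = 4 rows and columns.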
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d443855758
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
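+  // These 16 pointers cover the 4x4 input patch needed for a 2x2 output of a
+  // stride-1 3x3 kernel; input_ptrs is re-ordered into the sequence in which
+  // the assembly consumes it.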
+ const float *const inptrs[16] = {
+ input_ptrs[0], input_ptrs[1], input_ptrs[4], input_ptrs[5], input_ptrs[2], input_ptrs[6], input_ptrs[3], input_ptrs[7], input_ptrs[8], input_ptrs[9], input_ptrs[10], input_ptrs[11], input_ptrs[12], input_ptrs[13], input_ptrs[14], input_ptrs[15],
+ };
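+  // The clamp bounds are broadcast below (the ld1rw pair) into z17/z16 and
+  // applied as an fmax/fmin pair to every accumulator before it is stored.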
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ldp x26, x23, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "ldp x25, x16, [%x[inptrs], #0x10]\n"
+ "mov x15, #0x0\n"
+ "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+ "mov z14.d, z15.d\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "cntw x14\n"
+ "mov z12.d, z15.d\n"
+ "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sub x13, XZR, x14\n"
+ "mov z10.d, z15.d\n"
+ "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov z8.d, z15.d\n"
+ "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
+ "ldp x24, x12, [%x[inptrs], #0x20]\n"
+ "ldp x23, x11, [%x[inptrs], #0x30]\n"
+ "ldp x10, x9, [%x[inptrs], #0x40]\n"
+ "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
+ "ldp x28, x27, [%x[inptrs], #0x50]\n"
+ "ldp x26, x25, [%x[inptrs], #0x60]\n"
+ "ldp x24, x23, [%x[inptrs], #0x70]\n"
+ "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "ldp x22, x21, [%x[outptrs], #0x0]\n"
+ "ldp x20, x19, [%x[outptrs], #0x10]\n"
+ "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n"
+ "bge 1f\n"
+ "1:" // Loop
+ "fmla z14.s, p2/M, z13.s, z3.s\n"
+ "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+ "incw x13\n"
+ "fmla z12.s, p2/M, z13.s, z0.s\n"
+ "ldp x26, x23, [%x[inptrs], #0x0]\n"
+ "mov p0.b, p1.b\n"
+ "fmla z10.s, p2/M, z13.s, z31.s\n"
+ "ldp x25, x16, [%x[inptrs], #0x10]\n"
+ "mov x15, x14\n"
+ "fmla z8.s, p2/M, z13.s, z30.s\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "incw x14\n"
+ "fmla z14.s, p2/M, z11.s, z0.s\n"
+ "ldp x24, x12, [%x[inptrs], #0x20]\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
+ "fmla z12.s, p2/M, z11.s, z29.s\n"
+ "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "cmp x14, %x[n_channels]\n"
+ "fmla z10.s, p2/M, z11.s, z30.s\n"
+ "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "ldp x23, x11, [%x[inptrs], #0x30]\n"
+ "fmla z8.s, p2/M, z11.s, z28.s\n"
+ "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z14.s, p2/M, z9.s, z29.s\n"
+ "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z9.s, z27.s\n"
+ "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "fmla z10.s, p2/M, z9.s, z28.s\n"
+ "ldp x10, x9, [%x[inptrs], #0x40]\n"
+ "fmla z8.s, p2/M, z9.s, z26.s\n"
+ "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z14.s, p2/M, z7.s, z31.s\n"
+ "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z7.s, z30.s\n"
+ "ldp x28, x27, [%x[inptrs], #0x50]\n"
+ "fmla z10.s, p2/M, z7.s, z25.s\n"
+ "ldp x26, x25, [%x[inptrs], #0x60]\n"
+ "fmla z8.s, p2/M, z7.s, z24.s\n"
+ "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z14.s, p2/M, z6.s, z30.s\n"
+ "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z6.s, z28.s\n"
+ "ldp x24, x23, [%x[inptrs], #0x70]\n"
+ "fmla z10.s, p2/M, z6.s, z24.s\n"
+ "fmla z8.s, p2/M, z6.s, z23.s\n"
+ "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z14.s, p2/M, z5.s, z28.s\n"
+ "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z5.s, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
+ "fmla z10.s, p2/M, z5.s, z23.s\n"
+ "fmla z8.s, p2/M, z5.s, z22.s\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z14.s, p2/M, z4.s, z25.s\n"
+ "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z4.s, z24.s\n"
+ "fmla z10.s, p2/M, z4.s, z21.s\n"
+ "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "fmla z8.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmla z14.s, p2/M, z2.s, z24.s\n"
+ "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z2.s, z23.s\n"
+ "fmla z10.s, p2/M, z2.s, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "fmla z8.s, p2/M, z2.s, z19.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z14.s, p2/M, z1.s, z23.s\n"
+ "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "fmla z12.s, p2/M, z1.s, z22.s\n"
+ "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "fmla z10.s, p2/M, z1.s, z19.s\n"
+ "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "fmla z8.s, p2/M, z1.s, z18.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "fmax z14.s, p2/M, z14.s, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "fmax z12.s, p2/M, z12.s, z17.s\n"
+ "fmax z10.s, p2/M, z10.s, z17.s\n"
+ "fmax z8.s, p2/M, z8.s, z17.s\n"
+ "fmin z14.s, p2/M, z14.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x13, LSL #2]\n"
+ "mov z14.d, z15.d\n"
+ "fmin z12.s, p2/M, z12.s, z16.s\n"
+ "st1w { z12.s }, p0, [x21, x13, LSL #2]\n"
+ "mov z12.d, z15.d\n"
+ "fmin z10.s, p2/M, z10.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x13, LSL #2]\n"
+ "mov z10.d, z15.d\n"
+ "fmin z8.s, p2/M, z8.s, z16.s\n"
+ "st1w { z8.s }, p0, [x19, x13, LSL #2]\n"
+ "mov z8.d, z15.d\n"
+ "blt 1b\n"
+ "2:" // Tail
+ "fmla z14.s, p2/M, z13.s, z3.s\n"
+ "incw x13\n"
+ "fmla z12.s, p2/M, z13.s, z0.s\n"
+ "mov p0.b, p1.b\n"
+ "fmla z10.s, p2/M, z13.s, z31.s\n"
+ "fmla z8.s, p2/M, z13.s, z30.s\n"
+ "fmla z14.s, p2/M, z11.s, z0.s\n"
+ "fmla z12.s, p2/M, z11.s, z29.s\n"
+ "fmla z10.s, p2/M, z11.s, z30.s\n"
+ "fmla z8.s, p2/M, z11.s, z28.s\n"
+ "fmla z14.s, p2/M, z9.s, z29.s\n"
+ "fmla z12.s, p2/M, z9.s, z27.s\n"
+ "fmla z10.s, p2/M, z9.s, z28.s\n"
+ "fmla z8.s, p2/M, z9.s, z26.s\n"
+ "fmla z14.s, p2/M, z7.s, z31.s\n"
+ "fmla z12.s, p2/M, z7.s, z30.s\n"
+ "fmla z10.s, p2/M, z7.s, z25.s\n"
+ "fmla z8.s, p2/M, z7.s, z24.s\n"
+ "fmla z14.s, p2/M, z6.s, z30.s\n"
+ "fmla z12.s, p2/M, z6.s, z28.s\n"
+ "fmla z10.s, p2/M, z6.s, z24.s\n"
+ "fmla z8.s, p2/M, z6.s, z23.s\n"
+ "fmla z14.s, p2/M, z5.s, z28.s\n"
+ "fmla z12.s, p2/M, z5.s, z26.s\n"
+ "fmla z10.s, p2/M, z5.s, z23.s\n"
+ "fmla z8.s, p2/M, z5.s, z22.s\n"
+ "fmla z14.s, p2/M, z4.s, z25.s\n"
+ "fmla z12.s, p2/M, z4.s, z24.s\n"
+ "fmla z10.s, p2/M, z4.s, z21.s\n"
+ "fmla z8.s, p2/M, z4.s, z20.s\n"
+ "fmla z14.s, p2/M, z2.s, z24.s\n"
+ "fmla z12.s, p2/M, z2.s, z23.s\n"
+ "fmla z10.s, p2/M, z2.s, z20.s\n"
+ "fmla z8.s, p2/M, z2.s, z19.s\n"
+ "fmla z14.s, p2/M, z1.s, z23.s\n"
+ "fmla z12.s, p2/M, z1.s, z22.s\n"
+ "fmla z10.s, p2/M, z1.s, z19.s\n"
+ "fmla z8.s, p2/M, z1.s, z18.s\n"
+ "fmax z14.s, p2/M, z14.s, z17.s\n"
+ "fmax z12.s, p2/M, z12.s, z17.s\n"
+ "fmax z10.s, p2/M, z10.s, z17.s\n"
+ "fmax z8.s, p2/M, z8.s, z17.s\n"
+ "fmin z14.s, p2/M, z14.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x13, LSL #2]\n"
+ "fmin z12.s, p2/M, z12.s, z16.s\n"
+ "fmin z10.s, p2/M, z10.s, z16.s\n"
+ "st1w { z12.s }, p0, [x21, x13, LSL #2]\n"
+ "fmin z8.s, p2/M, z8.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x13, LSL #2]\n"
+ "st1w { z8.s }, p0, [x19, x13, LSL #2]\n"
+ : [params] "+r" (params)
+ : [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((unsigned long) n_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d899255e84
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
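+
+  // tile_i and tile_j live inside Args rather than staying in registers so
+  // the assembly can spill them (str at the top of the tile loop) and reload
+  // them in the channel tail when computing the next tile's coordinates.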
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
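+
+  // The asm block receives only &params_struct; every field is reached via
+  // the offsetof(...) immediates bound in the operand list at the end of the
+  // block.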
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x8, #0x0\n"
+ "mov x17, #0x0\n"
+ "1:" // Tile loop
+ "str x8, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x21, #0x2\n"
+ "str x17, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "cntb x16\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x16, XZR, LSL #4\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cntb x14\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cntb x12\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x8, x20\n" // offset = tile_i * ld_input_row
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x17, x13, x19\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x11, x11, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x28, x11, x20, LSL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x27, x28, x20, LSL #2\n"
+ "ld1w { z16.s }, p3/Z, [x15]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
+ "add x26, x27, x20, LSL #2\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
+ "add x25, x13, x13\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
+ "add x24, x25, x13\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
+ "add x14, x14, x13, LSL #4\n"
+ "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
+ "add x12, x12, x25, LSL #4\n"
+ "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
+ "cntb x23\n"
+ "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
+ "add x23, x23, x24, LSL #4\n"
+ "prfm pldl1keep, [x28, x14]\n"
+ "mov x20, #0x2\n"
+ "prfm pldl1keep, [x11, x16]\n"
+ "mul x19, x8, x22\n" // offset = tile_i * ld_output_row
+ "prfm pldl1keep, [x11, x23]\n"
+ "madd x19, x17, x10, x19\n" // offset += tile_j * ld_output_col
+ "prfm pldl1keep, [x28, x12]\n"
+ "mul x19, x19, x20\n" // offset *= output_tile_size
+ "prfm pldl1keep, [x27, x14]\n"
+ "add x9, x9, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "mov x21, #0x0\n"
+ "add x22, x9, x22, LSL #2\n"
+ "cntw x20\n"
+ "sub x19, XZR, x20\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x28, x13, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11]\n"
+ "addvl x15, x15, #16\n"
+ "ld1w { z11.s }, p2/Z, [x11, x24, LSL #2]\n"
+ "cmp x20, %x[n_channels]\n"
+ "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
+ "addvl x15, x15, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x26, x16]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x26, x23]\n"
+ "incw x19\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x11, x14]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x26]\n"
+ "incw x21\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x11, x12]\n"
+ "incw x20\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x27, x25, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "prfm pldl1keep, [x27, x12]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x28, x16]\n"
+ "fmla z31.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x28, x23]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x27, x16]\n"
+ "prfm pldl1keep, [x27, x23]\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x26, x12]\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z16.s }, p3/Z, [x15]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "addvl x28, x28, #1\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x28, x14]\n"
+ "prfm pldl1keep, [x11, x16]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "prfm pldl1keep, [x11, x23]\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "prfm pldl1keep, [x28, x12]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "addvl x27, x27, #1\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z13.s }, p1/Z, [x27, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x27, x14]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z9.s }, p1/Z, [x28, x13, LSL #2]\n"
+ "cmp x20, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p1/Z, [x11]\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x11, x24, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z12.s }, p1/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "addvl x15, x15, #16\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
+ "addvl x15, x15, #-6\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z31.s }, p0, [x9]\n"
+ "mov z31.d, z16.d\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "mov z30.d, z16.d\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "mov z29.d, z16.d\n"
+ "st1w { z28.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z28.d, z16.d\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x26, x16]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x26, x23]\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x11, x14]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x26]\n"
+ "prfm pldl1keep, [x11, x12]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x27, x12]\n"
+ "prfm pldl1keep, [x28, x16]\n"
+ "fmla z31.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x28, x23]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x25, LSL #2]\n"
+ "prfm pldl1keep, [x27, x16]\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x27, x23]\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "prfm pldl1keep, [x26, x12]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "add x21, x8, #0x1\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x17, x17, #0x1\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x17, x19\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "csel x17, x17, XZR, LT\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "csel x8, x8, x21, LT\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "cmp x8, x20\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x9]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z28.s }, p0, [x22, x10, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..e8a1539437
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
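+ // One pointer per element of the 4x4 input patch: a 2x2 output tile of a
+ // 3x3 stride-1 kernel reads 2 + 3 - 1 = 4 rows and columns of input.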
+ const float *inptrs[16];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[2];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[7];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[10];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[12];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
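+ // Channel loop: each iteration handles one SVE vector's worth of channels.
+ // whilelt keeps p2 (current vector) and p1 (pre-loaded next vector) valid,
+ // so the final partial vector falls through to the "Channel tail" block.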
+ __asm__ __volatile__(
+ "ldr x2, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x19, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntb x4, ALL, MUL #2\n"
+ "ldp x5, x6, [x19, #0x0]\n"
+ "mov x7, #0x0\n"
+ "ldp x8, x17, [x19, #0x10]\n"
+ "cntw x16\n"
+ "ldp x15, x14, [x19, #0x20]\n"
+ "sub x13, XZR, x16\n"
+ "ldp x12, x11, [x19, #0x30]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ldp x10, x9, [x19, #0x40]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ldp x28, x27, [x19, #0x50]\n"
+ "ldp x26, x25, [x19, #0x60]\n"
+ "ldp x24, x23, [x19, #0x70]\n"
+ "ldp x22, x21, [x2, #0x0]\n"
+ "ldp x20, x19, [x2, #0x10]\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z16.s }, p3/Z, [x3]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x3, #1, MUL VL]\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x3, #2, MUL VL]\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x3, #3, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x3, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x3, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x3, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x3, #7, MUL VL]\n"
+ "addvl x3, x3, #16\n"
+ "ld1w { z9.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x3, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x3, #-7, MUL VL]\n"
+ "addvl x3, x3, #-6\n"
+ "prfm pldl1keep, [x14, x4]\n"
+ "ld1w { z10.s }, p2/Z, [x5, x7, LSL #2]\n"
+ "prfm pldl1keep, [x5, x4]\n"
+ "ld1w { z11.s }, p2/Z, [x17, x7, LSL #2]\n"
+ "prfm pldl1keep, [x17, x4]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
+ "prfm pldl1keep, [x12, x4]\n"
+ "ld1w { z13.s }, p2/Z, [x9, x7, LSL #2]\n"
+ "prfm pldl1keep, [x9, x4]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x26, x4]\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x23, x4]\n"
+ "incw x13\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x6, x4]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x7, LSL #2]\n"
+ "prfm pldl1keep, [x8, x4]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "prfm pldl1keep, [x28, x4]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x15, x4]\n"
+ "fmla z31.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x11, x4]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x6, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "prfm pldl1keep, [x10, x4]\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x27, x4]\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "prfm pldl1keep, [x25, x4]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x24, x4]\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "addvl x4, x4, #1\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "prfm pldl1keep, [x14, x4]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x5, x4]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x17, x4]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "prfm pldl1keep, [x12, x4]\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x10, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z13.s }, p1/Z, [x9, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "prfm pldl1keep, [x9, x4]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z16.s }, p3/Z, [x3]\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z0.s }, p3/Z, [x3, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x7, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z1.s }, p3/Z, [x3, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "incw x7\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x3, #3, MUL VL]\n"
+ "whilelt p2.s, x7, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x17, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p1/Z, [x5, x16, LSL #2]\n"
+ "ld1w { z3.s }, p3/Z, [x3, #4, MUL VL]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z4.s }, p3/Z, [x3, #5, MUL VL]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z12.s }, p1/Z, [x12, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z5.s }, p3/Z, [x3, #6, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "ld1w { z6.s }, p3/Z, [x3, #7, MUL VL]\n"
+ "addvl x3, x3, #16\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z7.s }, p3/Z, [x3, #-8, MUL VL]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z8.s }, p3/Z, [x3, #-7, MUL VL]\n"
+ "addvl x3, x3, #-6\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z31.s }, p0, [x22, x13, LSL #2]\n"
+ "mov z31.d, z16.d\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "mov z30.d, z16.d\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z29.s }, p0, [x20, x13, LSL #2]\n"
+ "mov z29.d, z16.d\n"
+ "st1w { z28.s }, p0, [x19, x13, LSL #2]\n"
+ "mov z28.d, z16.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x26, x4]\n"
+ "incw x13\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x23, x4]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x6, x4]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x7, LSL #2]\n"
+ "prfm pldl1keep, [x8, x4]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x28, x4]\n"
+ "prfm pldl1keep, [x15, x4]\n"
+ "fmla z31.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x11, x4]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x6, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "prfm pldl1keep, [x10, x4]\n"
+ "fmla z31.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x27, x4]\n"
+ "fmla z30.s, p3/M, z6.s, z13.s\n"
+ "prfm pldl1keep, [x25, x4]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "prfm pldl1keep, [x24, x4]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x10, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x7, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x22, x13, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "st1w { z29.s }, p0, [x20, x13, LSL #2]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z28.s }, p0, [x19, x13, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
new file mode 100644
index 0000000000..173fc631d8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided
+{
+ typedef float bias_type;
+ typedef float operand_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
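+ // For a stride-1 3x3 kernel, each spatial dimension of the input patch is
+ // output + kernel - 1 = 4.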
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ kern_type kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl;
+
+ sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
new file mode 100644
index 0000000000..cecc192c49
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(
+ const float *const inptr,
+ const size_t in_row_stride,
+ const size_t in_col_stride,
+ float *const outptr,
+ const size_t out_row_stride,
+ const size_t out_col_stride,
+ const void *params,
+ unsigned long n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
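+ // The activation bounds are broadcast into z17 (min) and z16 (max) below and
+ // applied with fmax/fmin to each accumulator before it is stored.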
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+ "mov z14.d, z15.d\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov z12.d, z15.d\n"
+ "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "mov x26, %x[inptr]\n"
+ "mov z10.d, z15.d\n"
+ "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "add x25, x26, %x[in_row_stride], LSL #2\n"
+ "mov z8.d, z15.d\n"
+ "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "add x24, x25, %x[in_row_stride], LSL #2\n"
+ "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "add x23, x24, %x[in_row_stride], LSL #2\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "mov x22, %x[outptr]\n"
+ "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "add x21, x22, %x[out_row_stride], LSL #2\n"
+ "ld1w { z3.s }, p1/Z, [x26]\n"
+ "add x20, %x[in_col_stride], %x[in_col_stride]\n"
+ "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n"
+ "add x19, x20, %x[in_col_stride]\n"
+ "ld1w { z1.s }, p1/Z, [x25]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n"
+ "decw %x[n_channels]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "cmp %x[n_channels], XZR\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x24]\n"
+ "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n"
+ "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
+ "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n"
+ "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n"
+ "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "fmla z14.s, p2/M, z13.s, z3.s\n"
+ "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z12.s, p2/M, z13.s, z2.s\n"
+ "addvl x25, x25, #1\n"
+ "fmla z10.s, p2/M, z13.s, z1.s\n"
+ "addvl x24, x24, #1\n"
+ "fmla z8.s, p2/M, z13.s, z0.s\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z14.s, p2/M, z11.s, z2.s\n"
+ "decw %x[n_channels]\n"
+ "mov p0.b, p1.b\n"
+ "fmla z12.s, p2/M, z11.s, z29.s\n"
+ "fmla z10.s, p2/M, z11.s, z0.s\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "ld1w { z3.s }, p1/Z, [x26]\n"
+ "fmla z8.s, p2/M, z11.s, z28.s\n"
+ "cmp %x[n_channels], XZR\n"
+ "fmla z14.s, p2/M, z9.s, z29.s\n"
+ "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n"
+ "fmla z12.s, p2/M, z9.s, z27.s\n"
+ "fmla z10.s, p2/M, z9.s, z28.s\n"
+ "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n"
+ "fmla z8.s, p2/M, z9.s, z26.s\n"
+ "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z14.s, p2/M, z7.s, z1.s\n"
+ "ld1w { z1.s }, p1/Z, [x25]\n"
+ "fmla z12.s, p2/M, z7.s, z0.s\n"
+ "fmla z10.s, p2/M, z7.s, z25.s\n"
+ "fmla z8.s, p2/M, z7.s, z24.s\n"
+ "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z14.s, p2/M, z6.s, z0.s\n"
+ "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n"
+ "fmla z12.s, p2/M, z6.s, z28.s\n"
+ "fmla z10.s, p2/M, z6.s, z24.s\n"
+ "fmla z8.s, p2/M, z6.s, z23.s\n"
+ "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z14.s, p2/M, z5.s, z28.s\n"
+ "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n"
+ "fmla z12.s, p2/M, z5.s, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n"
+ "fmla z10.s, p2/M, z5.s, z23.s\n"
+ "fmla z8.s, p2/M, z5.s, z22.s\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z14.s, p2/M, z4.s, z25.s\n"
+ "ld1w { z25.s }, p1/Z, [x24]\n"
+ "fmla z12.s, p2/M, z4.s, z24.s\n"
+ "fmla z10.s, p2/M, z4.s, z21.s\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
+ "fmla z8.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmla z14.s, p2/M, z31.s, z24.s\n"
+ "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n"
+ "fmla z12.s, p2/M, z31.s, z23.s\n"
+ "fmla z10.s, p2/M, z31.s, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n"
+ "fmla z8.s, p2/M, z31.s, z19.s\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z14.s, p2/M, z30.s, z23.s\n"
+ "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n"
+ "fmla z12.s, p2/M, z30.s, z22.s\n"
+ "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n"
+ "fmla z10.s, p2/M, z30.s, z19.s\n"
+ "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n"
+ "fmla z8.s, p2/M, z30.s, z18.s\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "fmax z14.s, p2/M, z14.s, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n"
+ "fmax z12.s, p2/M, z12.s, z17.s\n"
+ "fmax z10.s, p2/M, z10.s, z17.s\n"
+ "fmax z8.s, p2/M, z8.s, z17.s\n"
+ "fmin z14.s, p2/M, z14.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22]\n"
+ "mov z14.d, z15.d\n"
+ "fmin z12.s, p2/M, z12.s, z16.s\n"
+ "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n"
+ "mov z12.d, z15.d\n"
+ "addvl x22, x22, #1\n"
+ "fmin z10.s, p2/M, z10.s, z16.s\n"
+ "st1w { z10.s }, p0, [x21]\n"
+ "mov z10.d, z15.d\n"
+ "fmin z8.s, p2/M, z8.s, z16.s\n"
+ "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n"
+ "mov z8.d, z15.d\n"
+ "addvl x21, x21, #1\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "fmla z14.s, p2/M, z13.s, z3.s\n"
+ "mov p0.b, p1.b\n"
+ "fmla z12.s, p2/M, z13.s, z2.s\n"
+ "fmla z10.s, p2/M, z13.s, z1.s\n"
+ "fmla z8.s, p2/M, z13.s, z0.s\n"
+ "fmla z14.s, p2/M, z11.s, z2.s\n"
+ "fmla z12.s, p2/M, z11.s, z29.s\n"
+ "fmla z10.s, p2/M, z11.s, z0.s\n"
+ "fmla z8.s, p2/M, z11.s, z28.s\n"
+ "fmla z14.s, p2/M, z9.s, z29.s\n"
+ "fmla z12.s, p2/M, z9.s, z27.s\n"
+ "fmla z10.s, p2/M, z9.s, z28.s\n"
+ "fmla z8.s, p2/M, z9.s, z26.s\n"
+ "fmla z14.s, p2/M, z7.s, z1.s\n"
+ "fmla z12.s, p2/M, z7.s, z0.s\n"
+ "fmla z10.s, p2/M, z7.s, z25.s\n"
+ "fmla z8.s, p2/M, z7.s, z24.s\n"
+ "fmla z14.s, p2/M, z6.s, z0.s\n"
+ "fmla z12.s, p2/M, z6.s, z28.s\n"
+ "fmla z10.s, p2/M, z6.s, z24.s\n"
+ "fmla z8.s, p2/M, z6.s, z23.s\n"
+ "fmla z14.s, p2/M, z5.s, z28.s\n"
+ "fmla z12.s, p2/M, z5.s, z26.s\n"
+ "fmla z10.s, p2/M, z5.s, z23.s\n"
+ "fmla z8.s, p2/M, z5.s, z22.s\n"
+ "fmla z14.s, p2/M, z4.s, z25.s\n"
+ "fmla z12.s, p2/M, z4.s, z24.s\n"
+ "fmla z10.s, p2/M, z4.s, z21.s\n"
+ "fmla z8.s, p2/M, z4.s, z20.s\n"
+ "fmla z14.s, p2/M, z31.s, z24.s\n"
+ "fmla z12.s, p2/M, z31.s, z23.s\n"
+ "fmla z10.s, p2/M, z31.s, z20.s\n"
+ "fmla z8.s, p2/M, z31.s, z19.s\n"
+ "fmla z14.s, p2/M, z30.s, z23.s\n"
+ "fmla z12.s, p2/M, z30.s, z22.s\n"
+ "fmla z10.s, p2/M, z30.s, z19.s\n"
+ "fmla z8.s, p2/M, z30.s, z18.s\n"
+ "fmax z14.s, p2/M, z14.s, z17.s\n"
+ "fmax z12.s, p2/M, z12.s, z17.s\n"
+ "fmax z10.s, p2/M, z10.s, z17.s\n"
+ "fmax z8.s, p2/M, z8.s, z17.s\n"
+ "fmin z14.s, p2/M, z14.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22]\n"
+ "fmin z12.s, p2/M, z12.s, z16.s\n"
+ "fmin z10.s, p2/M, z10.s, z16.s\n"
+ "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n"
+ "fmin z8.s, p2/M, z8.s, z16.s\n"
+ "st1w { z10.s }, p0, [x21]\n"
+ "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n"
+ : [n_channels] "+r" (n_channels), [params] "+r" (params)
+ : [in_col_stride] "r" (in_col_stride), [in_row_stride] "r" (in_row_stride), [inptr] "r" (inptr), [minmax_vals] "r" (minmax_vals), [out_col_stride] "r" (out_col_stride), [out_row_stride] "r" (out_row_stride), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5ec78aa05f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
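+ // A 3x3 output tile at stride 1 requires a 5x5 input patch (3 + 3 - 1 = 5).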
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4d0bd311cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
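+ // Live tile coordinates: the assembly stores these back to the struct at the
+ // top of each tile iteration and reloads them in the tail, advancing tile_j
+ // and wrapping it (stepping tile_i) via csel.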
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
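+ // The outer "Tile loop" walks the output tiles; the inner "Channel loop"
+ // processes one SVE vector of channels per pass, with nine accumulators
+ // (z23-z31) holding the 3x3 output tile.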
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x3, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "cntb x5\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x5, x5, XZR, LSL #4\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cntb x7\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cntb x17\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x3, x20\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x4, x8, x19\n" // offset += tile_j * ld_input_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x22\n" // offset *= kernel_stride * output_size
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x16, x16, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x13, x16, x20, LSL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x12, x13, x20, LSL #2\n"
+ "ld1w { z16.s }, p3/Z, [x6]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+ "add x11, x12, x20, LSL #2\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+ "add x10, x11, x20, LSL #2\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+ "add x9, x8, x8\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+ "add x28, x9, x8\n"
+ "mov z27.d, z16.d\n"
+ "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+ "add x27, x28, x8\n"
+ "mov z26.d, z16.d\n"
+ "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+ "add x7, x7, x8, LSL #4\n"
+ "mov z25.d, z16.d\n"
+ "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+ "add x17, x17, x9, LSL #4\n"
+ "mov z24.d, z16.d\n"
+ "prfm pldl1keep, [x12, x17]\n"
+ "cntb x26\n"
+ "mov z23.d, z16.d\n"
+ "prfm pldl1keep, [x16, x5]\n"
+ "add x26, x26, x28, LSL #4\n"
+ "cntb x25\n"
+ "mov x20, #0x3\n"
+ "add x25, x25, x27, LSL #4\n"
+ "prfm pldl1keep, [x16, x25]\n"
+ "prfm pldl1keep, [x10, x5]\n"
+ "mul x19, x3, x21\n" // offset = tile_i * ld_output_row
+ "prfm pldl1keep, [x13, x17]\n"
+ "madd x19, x4, x15, x19\n" // offset += tile_j * ld_output_col
+ "add x24, x15, x15\n"
+ "mul x19, x19, x20\n" // offset *= output_tile_size
+ "add x14, x14, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x23, x14, x21, LSL #2\n"
+ "add x22, x23, x21, LSL #2\n"
+ "mov x21, #0x0\n"
+ "cntw x20\n"
+ "sub x19, XZR, x20\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x16]\n"
+ "addvl x6, x6, #16\n"
+ "ld1w { z11.s }, p2/Z, [x16, x27, LSL #2]\n"
+ "cmp x20, %x[n_channels]\n"
+ "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+ "addvl x6, x6, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x10]\n"
+ "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "prfm pldl1keep, [x10, x25]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "prfm pldl1keep, [x12, x7]\n"
+ "incw x19\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "prfm pldl1keep, [x16, x7]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x16, x26]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x12, x26]\n"
+ "incw x20\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x13, x5]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "prfm pldl1keep, [x13, x25]\n"
+ "fmla z24.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x11, x5]\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x11, x17]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x11, x25]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x10, x7]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "prfm pldl1keep, [x13, x7]\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "prfm pldl1keep, [x13, x26]\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x10, x26]\n"
+ "fmla z26.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x16, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x11, x7]\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "prfm pldl1keep, [x16, x17]\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "prfm pldl1keep, [x11, x26]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "prfm pldl1keep, [x12, x5]\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "prfm pldl1keep, [x12, x25]\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x10, x17]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z16.s }, p3/Z, [x6]\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x28, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x9, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z24.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z10.s }, p1/Z, [x16]\n"
+ "fmla z23.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12]\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z27.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x9, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x12, x17]\n"
+ "fmla z24.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x16, x5]\n"
+ "fmla z23.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "prfm pldl1keep, [x16, x25]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x10, x5]\n"
+ "cmp x20, %x[n_channels]\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x10]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "prfm pldl1keep, [x13, x17]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x16, x27, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z25.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z13.s }, p1/Z, [x13, x9, LSL #2]\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+ "addvl x6, x6, #16\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+ "addvl x6, x6, #-6\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z31.s }, p0, [x14]\n"
+ "mov z31.d, z16.d\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x14, x15, LSL #2]\n"
+ "mov z30.d, z16.d\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
+ "mov z29.d, z16.d\n"
+ "addvl x14, x14, #1\n"
+ "fmax z27.s, p3/M, z27.s, z18.s\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "mov z28.d, z16.d\n"
+ "fmax z26.s, p3/M, z26.s, z18.s\n"
+ "fmax z25.s, p3/M, z25.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z17.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "mov z27.d, z16.d\n"
+ "fmin z26.s, p3/M, z26.s, z17.s\n"
+ "st1w { z26.s }, p0, [x23, x24, LSL #2]\n"
+ "mov z26.d, z16.d\n"
+ "addvl x23, x23, #1\n"
+ "fmin z25.s, p3/M, z25.s, z17.s\n"
+ "st1w { z25.s }, p0, [x22]\n"
+ "mov z25.d, z16.d\n"
+ "fmin z24.s, p3/M, z24.s, z17.s\n"
+ "st1w { z24.s }, p0, [x22, x15, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmax z23.s, p3/M, z23.s, z18.s\n"
+ "fmin z23.s, p3/M, z23.s, z17.s\n"
+ "st1w { z23.s }, p0, [x22, x24, LSL #2]\n"
+ "mov z23.d, z16.d\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "prfm pldl1keep, [x10, x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "prfm pldl1keep, [x12, x7]\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "prfm pldl1keep, [x16, x7]\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x16, x26]\n"
+ "fmla z27.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x12, x26]\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x13, x5]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "prfm pldl1keep, [x13, x25]\n"
+ "fmla z24.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x11, x5]\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x11, x17]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x11, x25]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x10, x7]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "prfm pldl1keep, [x13, x7]\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "prfm pldl1keep, [x13, x26]\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x10, x26]\n"
+ "fmla z26.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x16, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x11, x7]\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "prfm pldl1keep, [x16, x17]\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "prfm pldl1keep, [x11, x26]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "prfm pldl1keep, [x12, x5]\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "prfm pldl1keep, [x12, x25]\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x10, x17]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x3, #0x1\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x4, x4, #0x1\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x4, x19\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "csel x4, x4, XZR, LT\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "csel x3, x3, x21, LT\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "cmp x3, x20\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x28, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z13.s\n"
+ "fmla z23.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12]\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z13.s\n"
+ "fmla z26.s, p3/M, z7.s, z13.s\n"
+ "fmla z24.s, p3/M, z5.s, z13.s\n"
+ "fmla z23.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z8.s, z13.s\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x14]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x14, x15, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmax z27.s, p3/M, z27.s, z18.s\n"
+ "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z18.s\n"
+ "fmax z25.s, p3/M, z25.s, z18.s\n"
+ "fmax z24.s, p3/M, z24.s, z18.s\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "fmin z27.s, p3/M, z27.s, z17.s\n"
+ "fmin z26.s, p3/M, z26.s, z17.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z17.s\n"
+ "fmin z24.s, p3/M, z24.s, z17.s\n"
+ "st1w { z26.s }, p0, [x23, x24, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z18.s\n"
+ "st1w { z25.s }, p0, [x22]\n"
+ "fmin z23.s, p3/M, z23.s, z17.s\n"
+ "st1w { z24.s }, p0, [x22, x15, LSL #2]\n"
+ "st1w { z23.s }, p0, [x22, x24, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..7c6fb306b7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
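+ // 25 pointers cover the 5x5 input patch; the constructor reorders them from
+ // the caller's order into the order the assembly reads them.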
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
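+ // Same channel-loop structure as the 2x2 variant, but the input pointers are
+ // fetched from the inptrs array on demand (ldp/ldr from x15) rather than
+ // being held in registers throughout.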
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntb x14, ALL, MUL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x13, #0x0\n"
+ "ld1w { z16.s }, p3/Z, [x16]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cntw x12\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "sub x11, XZR, x12\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "mov z27.d, z16.d\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "mov z26.d, z16.d\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "mov z25.d, z16.d\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "mov z24.d, z16.d\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "mov z23.d, z16.d\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ldp x10, x22, [x15, #0x0]\n"
+ "ldp x9, x28, [x15, #0x10]\n"
+ "ldr x24, [x15, #0x20]\n"
+ "ld1w { z9.s }, p2/Z, [x10, x13, LSL #2]\n"
+ "prfm pldl1keep, [x10, x14]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "prfm pldl1keep, [x9, x14]\n"
+ "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+ "prfm pldl1keep, [x28, x14]\n"
+ "ld1w { z13.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x15, #0x28]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "incw x11\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x26, [x15, #0x38]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x27, x14]\n"
+ "fmla z27.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "ldr x25, [x15, #0x40]\n"
+ "fmla z24.s, p3/M, z1.s, z9.s\n"
+ "ldr x19, [x15, #0x48]\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ldr x24, [x15, #0x50]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "ldr x23, [x15, #0x58]\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x22, [x15, #0x60]\n"
+ "fmla z26.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ldr x19, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "ldr x10, [x15, #0x80]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ldr x22, [x15, #0x88]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "prfm pldl1keep, [x10, x14]\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ldr x9, [x15, #0x90]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x9, x14]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z25.s, p3/M, z3.s, z12.s\n"
+ "ldr x28, [x15, #0x98]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "prfm pldl1keep, [x28, x14]\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "ldr x27, [x15, #0xa8]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x15, #0xb0]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "prfm pldl1keep, [x27, x14]\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ldr x26, [x15, #0xb8]\n"
+ "fmla z25.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z24.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "ldr x25, [x15, #0xc0]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ldp x10, x22, [x15, #0x0]\n"
+ "fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p1/Z, [x10, x12, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "prfm pldl1keep, [x10, x14]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z10.s }, p1/Z, [x22, x12, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "ldp x9, x28, [x15, #0x10]\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ldr x24, [x15, #0x20]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x9, x14]\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "prfm pldl1keep, [x28, x14]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z13.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z26.s, p3/M, z7.s, z13.s\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla z24.s, p3/M, z5.s, z13.s\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla z23.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "incw x13\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "ldr x20, [x17, #0x10]\n"
+ "whilelt p2.s, x13, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x28, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x16]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x9, x12, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z13.s }, p1/Z, [x24, x12, LSL #2]\n"
+ "incw x12\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmax z27.s, p3/M, z27.s, z18.s\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
+ "mov z31.d, z16.d\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+ "mov z30.d, z16.d\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
+ "mov z29.d, z16.d\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin z27.s, p3/M, z27.s, z17.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmax z26.s, p3/M, z26.s, z18.s\n"
+ "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
+ "mov z28.d, z16.d\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmax z25.s, p3/M, z25.s, z18.s\n"
+ "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
+ "mov z27.d, z16.d\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin z26.s, p3/M, z26.s, z17.s\n"
+ "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
+ "mov z26.d, z16.d\n"
+ "fmin z25.s, p3/M, z25.s, z17.s\n"
+ "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z24.s, p3/M, z24.s, z18.s\n"
+ "fmax z23.s, p3/M, z23.s, z18.s\n"
+ "fmin z24.s, p3/M, z24.s, z17.s\n"
+ "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmin z23.s, p3/M, z23.s, z17.s\n"
+ "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
+ "mov z23.d, z16.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x15, #0x28]\n"
+ "incw x11\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x26, [x15, #0x38]\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x27, x14]\n"
+ "fmla z27.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "ldr x25, [x15, #0x40]\n"
+ "fmla z24.s, p3/M, z1.s, z9.s\n"
+ "ldr x19, [x15, #0x48]\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ldr x24, [x15, #0x50]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z28.s, p3/M, z2.s, z13.s\n"
+ "ldr x23, [x15, #0x58]\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x22, [x15, #0x60]\n"
+ "fmla z26.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ldr x19, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "ldr x10, [x15, #0x80]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "ldr x22, [x15, #0x88]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "prfm pldl1keep, [x10, x14]\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ldr x9, [x15, #0x90]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x9, x14]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z25.s, p3/M, z3.s, z12.s\n"
+ "ldr x28, [x15, #0x98]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "prfm pldl1keep, [x28, x14]\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "ldr x27, [x15, #0xa8]\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x15, #0xb0]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "prfm pldl1keep, [x27, x14]\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ldr x26, [x15, #0xb8]\n"
+ "fmla z25.s, p3/M, z7.s, z13.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z24.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x9, x13, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "ldr x25, [x15, #0xc0]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z13.s\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla z23.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z13.s\n"
+ "fmla z26.s, p3/M, z7.s, z13.s\n"
+ "fmla z24.s, p3/M, z5.s, z13.s\n"
+ "fmla z23.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z8.s, z13.s\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmla z23.s, p3/M, z6.s, z13.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z18.s\n"
+ "fmax z26.s, p3/M, z26.s, z18.s\n"
+ "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax z25.s, p3/M, z25.s, z18.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmax z24.s, p3/M, z24.s, z18.s\n"
+ "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z17.s\n"
+ "fmin z26.s, p3/M, z26.s, z17.s\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin z25.s, p3/M, z25.s, z17.s\n"
+ "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z17.s\n"
+ "fmax z23.s, p3/M, z23.s, z18.s\n"
+ "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
+ "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z17.s\n"
+ "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
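
The indirect kernels above all share one template: the Args constructor permutes the caller's row-major grid of input pointers into the order in which the hand-scheduled assembly loads them, every accumulator is seeded from the bias vector (z16 here), and each result is clamped with fmax/fmin against the activation bounds before being stored. As a minimal scalar sketch of what the 3x3/stride-1, 3x3-output variant computes per channel — the function and parameter names are illustrative only, and the real kernel reads the per-channel bias and weights from the packed params blob rather than from separate arguments:

#include <algorithm>

// Scalar reference: a 3x3 depthwise multiply-accumulate over a row-major
// 5x5 input patch, producing a 3x3 output tile, then an activation clamp.
void reference_dwc_3x3_s1_3x3(
    const float *const *input_ptrs,  // 25 pointers: row-major 5x5 patch
    float *const *outptrs,           // 9 pointers: row-major 3x3 tile
    const float *weights,            // 9 weights per channel: [c*9 + k]
    const float *bias,               // one bias per channel
    unsigned int n_channels,
    float act_min, float act_max)
{
  for (unsigned int c = 0; c < n_channels; c++)
  {
    for (int oi = 0; oi < 3; oi++)
    {
      for (int oj = 0; oj < 3; oj++)
      {
        float acc = bias[c];  // the asm seeds each accumulator from z16
        for (int ki = 0; ki < 3; ki++)
          for (int kj = 0; kj < 3; kj++)
            acc += weights[c*9 + ki*3 + kj]
                 * input_ptrs[(oi + ki)*5 + (oj + kj)][c];
        // fmax against z18 (min) and fmin against z17 (max) in the assembly
        outptrs[oi*3 + oj][c] = std::min(std::max(acc, act_min), act_max);
      }
    }
  }
}
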
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..a9823e3917
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
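
The header is a thin compile-time descriptor: besides the two kernel entry points it records the geometry constants from which the tile shapes follow. In particular the 6x6 input tile is implied by the 4x4 output tile and the 3x3 stride-1 kernel; a one-line sketch of that relation (illustrative code, not part of the header):

constexpr unsigned int input_dim(unsigned int out, unsigned int kern, unsigned int stride)
{
  // Receptive field of an output tile: kern + (out - 1) * stride.
  return kern + (out - 1) * stride;
}
static_assert(input_dim(4, 3, 1) == 6, "4x4 output of a 3x3/s1 kernel reads a 6x6 input");
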
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4c24ad9c15
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
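+    // Updated in place by the assembly's tile loop (str/ldr of
+    // offsetof_args_tile_i/_tile_j), so deliberately not const.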
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x24, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x23, #0x4\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x5, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cntw x6\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x21, XZR, x6\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x15, x8, x22, LSL #2\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x14, x15, x22, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x4]\n"
+ "mov z31.d, z13.d\n"
+ "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
+ "add x13, x14, x22, LSL #2\n"
+ "mov z30.d, z13.d\n"
+ "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
+ "add x12, x13, x22, LSL #2\n"
+ "mov z29.d, z13.d\n"
+ "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
+ "add x11, x12, x22, LSL #2\n"
+ "mov z28.d, z13.d\n"
+ "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
+ "add x10, x7, x7\n"
+ "mov z27.d, z13.d\n"
+ "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
+ "add x9, x10, x7\n"
+ "mov z26.d, z13.d\n"
+ "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
+ "add x28, x9, x7\n"
+ "mov z25.d, z13.d\n"
+ "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
+ "add x27, x28, x7\n"
+ "mov z24.d, z13.d\n"
+ "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
+ "mov z23.d, z13.d\n"
+ "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col
+ "mov z22.d, z13.d\n"
+ "mul x19, x19, x23\n" // offset *= output_tile_size
+ "mov z21.d, z13.d\n"
+ "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "mov z20.d, z13.d\n"
+ "add x26, x16, x20, LSL #2\n"
+ "mov z19.d, z13.d\n"
+ "add x25, x26, x20, LSL #2\n"
+ "mov z18.d, z13.d\n"
+ "add x24, x25, x20, LSL #2\n"
+ "mov z17.d, z13.d\n"
+ "add x23, x17, x17\n"
+ "mov z16.d, z13.d\n"
+ "add x22, x23, x17\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x8]\n"
+ "addvl x4, x4, #16\n"
+ "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
+ "cmp x6, %x[n_channels]\n"
+ "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n"
+ "addvl x4, x4, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z13.s }, p3/Z, [x4]\n"
+ "whilelt p1.s, x6, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "incw x21\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "incw x5\n"
+ "fmla z26.s, p3/M, z4.s, z9.s\n"
+ "incw x6\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "fmla z26.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z20.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z19.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x15]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z20.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z27.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z16.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z19.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z7.s, z11.s\n"
+ "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z16.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+ "whilelt p2.s, x5, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "cmp x6, %x[n_channels]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x14, x9, LSL #2]\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z10.s }, p1/Z, [x8]\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
+ "addvl x4, x4, #16\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n"
+ "addvl x4, x4, #-6\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "st1w { z31.s }, p0, [x16]\n"
+ "mov z31.d, z13.d\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "st1w { z30.s }, p0, [x16, x17, LSL #2]\n"
+ "mov z30.d, z13.d\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "st1w { z29.s }, p0, [x16, x23, LSL #2]\n"
+ "mov z29.d, z13.d\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "st1w { z28.s }, p0, [x16, x22, LSL #2]\n"
+ "mov z28.d, z13.d\n"
+ "addvl x16, x16, #1\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "mov z27.d, z13.d\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "st1w { z26.s }, p0, [x26, x17, LSL #2]\n"
+ "mov z26.d, z13.d\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "st1w { z25.s }, p0, [x26, x23, LSL #2]\n"
+ "mov z25.d, z13.d\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x22, LSL #2]\n"
+ "mov z24.d, z13.d\n"
+ "addvl x26, x26, #1\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "mov z23.d, z13.d\n"
+ "fmin z22.s, p3/M, z22.s, z14.s\n"
+ "st1w { z22.s }, p0, [x25, x17, LSL #2]\n"
+ "mov z22.d, z13.d\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p0, [x25, x23, LSL #2]\n"
+ "mov z21.d, z13.d\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z19.s, p3/M, z19.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ "st1w { z20.s }, p0, [x25, x22, LSL #2]\n"
+ "mov z20.d, z13.d\n"
+ "addvl x25, x25, #1\n"
+ "fmin z19.s, p3/M, z19.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24]\n"
+ "mov z19.d, z13.d\n"
+ "fmin z18.s, p3/M, z18.s, z14.s\n"
+ "st1w { z18.s }, p0, [x24, x17, LSL #2]\n"
+ "mov z18.d, z13.d\n"
+ "fmin z17.s, p3/M, z17.s, z14.s\n"
+ "st1w { z17.s }, p0, [x24, x23, LSL #2]\n"
+ "mov z17.d, z13.d\n"
+ "fmax z16.s, p3/M, z16.s, z15.s\n"
+ "fmin z16.s, p3/M, z16.s, z14.s\n"
+ "st1w { z16.s }, p0, [x24, x22, LSL #2]\n"
+ "mov z16.d, z13.d\n"
+ "addvl x24, x24, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x21, x2, #0x1\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z26.s, p3/M, z4.s, z9.s\n"
+ "cmp x3, x19\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "csel x2, x2, x21, LT\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "cmp x2, x20\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "fmla z26.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z20.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z19.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x15]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z20.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z16.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
+ "fmla z19.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z7.s, z11.s\n"
+ "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z16.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "st1w { z31.s }, p0, [x16]\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "st1w { z30.s }, p0, [x16, x17, LSL #2]\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "st1w { z29.s }, p0, [x16, x23, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "st1w { z28.s }, p0, [x16, x22, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "st1w { z26.s }, p0, [x26, x17, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "st1w { z25.s }, p0, [x26, x23, LSL #2]\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x22, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "fmin z22.s, p3/M, z22.s, z14.s\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ "st1w { z22.s }, p0, [x25, x17, LSL #2]\n"
+ "fmax z19.s, p3/M, z19.s, z15.s\n"
+ "st1w { z21.s }, p0, [x25, x23, LSL #2]\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "st1w { z20.s }, p0, [x25, x22, LSL #2]\n"
+ "fmin z19.s, p3/M, z19.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24]\n"
+ "fmin z18.s, p3/M, z18.s, z14.s\n"
+ "fmin z17.s, p3/M, z17.s, z14.s\n"
+ "st1w { z18.s }, p0, [x24, x17, LSL #2]\n"
+ "fmax z16.s, p3/M, z16.s, z15.s\n"
+ "st1w { z17.s }, p0, [x24, x23, LSL #2]\n"
+ "fmin z16.s, p3/M, z16.s, z14.s\n"
+ "st1w { z16.s }, p0, [x24, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
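
Before entering the tile loop, the direct kernel resolves the base pointers for the current tile from the row and column strides; the inline comments in the preamble spell the arithmetic out. A sketch of that addressing with illustrative names, assuming (as the trailing LSL #2 implies) that the strides are expressed in elements rather than bytes:

#include <cstdint>

// Input base pointer for the 4x4 output tile at (tile_i, tile_j). Each tile
// advances the input window by 4 rows / 4 columns at stride 1, hence the
// scale by 4; the asm's "LSL #2" supplies the final * sizeof(float).
const float *tile_input_ptr(const float *inptr, uint64_t tile_i, uint64_t tile_j,
                            int64_t ld_input_row, int64_t ld_input_col)
{
  const int64_t offset =
      (static_cast<int64_t>(tile_i) * ld_input_row +
       static_cast<int64_t>(tile_j) * ld_input_col) * 4;
  return inptr + offset;
}

The output base pointer is derived the same way from ld_output_row/ld_output_col, again scaled by the 4x4 tile size.
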
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..ac0c4ec4e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
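+    // Gather the caller's row-major 6x6 patch (36 pointers: the receptive
+    // field of a 4x4 output tile at stride 1) in the order the assembly
+    // loads them.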
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x7, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntb x8, ALL, MUL #2\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x17, #0x0\n"
+ "ld1w { z13.s }, p3/Z, [x6]\n"
+ "mov z31.d, z13.d\n"
+ "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+ "cntw x16\n"
+ "mov z30.d, z13.d\n"
+ "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+ "sub x15, XZR, x16\n"
+ "mov z29.d, z13.d\n"
+ "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mov z28.d, z13.d\n"
+ "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "mov z27.d, z13.d\n"
+ "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+ "mov z26.d, z13.d\n"
+ "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+ "mov z25.d, z13.d\n"
+ "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+ "addvl x6, x6, #16\n"
+ "mov z24.d, z13.d\n"
+ "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+ "mov z23.d, z13.d\n"
+ "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+ "addvl x6, x6, #-6\n"
+ "mov z22.d, z13.d\n"
+ "ldp x14, x13, [x7, #0x0]\n"
+ "mov z21.d, z13.d\n"
+ "ldp x12, x11, [x7, #0x10]\n"
+ "mov z20.d, z13.d\n"
+ "ld1w { z9.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "mov z19.d, z13.d\n"
+ "mov z18.d, z13.d\n"
+ "prfm pldl1keep, [x14, x8]\n"
+ "mov z17.d, z13.d\n"
+ "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "mov z16.d, z13.d\n"
+ "prfm pldl1keep, [x13, x8]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n"
+ "prfm pldl1keep, [x12, x8]\n"
+ "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "prfm pldl1keep, [x11, x8]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x7, #0x20]\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "ldr x21, [x7, #0x28]\n"
+ "incw x15\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [x7, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x22, x8]\n"
+ "fmla z26.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x21, x8]\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x20, x8]\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x19, [x7, #0x38]\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "ldr x10, [x7, #0x40]\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "prfm pldl1keep, [x19, x8]\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla z26.s, p3/M, z5.s, z12.s\n"
+ "ldr x9, [x7, #0x48]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ldr x28, [x7, #0x50]\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ldr x27, [x7, #0x58]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "prfm pldl1keep, [x28, x8]\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla z20.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x19, x17, LSL #2]\n"
+ "fmla z19.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "ldr x26, [x7, #0x60]\n"
+ "fmla z26.s, p3/M, z7.s, z9.s\n"
+ "ldr x25, [x7, #0x68]\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "ldr x24, [x7, #0x70]\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x25, x8]\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x7, #0x78]\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "ldr x14, [x7, #0x80]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x23, x8]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x14, x8]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "ldr x13, [x7, #0x88]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "ldr x12, [x7, #0x90]\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ldr x11, [x7, #0x98]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x13, x8]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x12, x8]\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "prfm pldl1keep, [x11, x8]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "ldr x22, [x7, #0xa0]\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "ldr x21, [x7, #0xa8]\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x22, x8]\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x21, x8]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "ldr x20, [x7, #0xb0]\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "ldr x19, [x7, #0xb8]\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x20, x8]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x19, x8]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "ldr x10, [x7, #0xc0]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "ldr x9, [x7, #0xc8]\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "ldr x28, [x7, #0xd0]\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z11.s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "prfm pldl1keep, [x28, x8]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ldr x27, [x7, #0xd8]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "ldr x26, [x7, #0xe0]\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ldr x25, [x7, #0xe8]\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "prfm pldl1keep, [x25, x8]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x24, [x7, #0xf0]\n"
+ "fmla z26.s, p3/M, z3.s, z10.s\n"
+ "ldr x23, [x7, #0xf8]\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ldr x14, [x7, #0x100]\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "prfm pldl1keep, [x23, x8]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x14, x8]\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ldr x13, [x7, #0x108]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "ldr x12, [x7, #0x110]\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "ldr x11, [x7, #0x118]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "prfm pldl1keep, [x13, x8]\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "prfm pldl1keep, [x12, x8]\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x19, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x11, x8]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "ldr x22, [x5, #0x0]\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "ldr x21, [x5, #0x8]\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "ldr x20, [x5, #0x10]\n"
+ "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "ldr x19, [x5, #0x18]\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z13.s }, p3/Z, [x6]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z20.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z16.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z19.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z7.s, z11.s\n"
+ "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z16.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "ldp x14, x13, [x7, #0x0]\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x16, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z10.s\n"
+ "prfm pldl1keep, [x14, x8]\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "prfm pldl1keep, [x13, x8]\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "incw x17\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "ldp x12, x11, [x7, #0x10]\n"
+ "whilelt p2.s, x17, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x12, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x12, x8]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "prfm pldl1keep, [x11, x8]\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x11, x16, LSL #2]\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z10.s }, p1/Z, [x13, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+ "addvl x6, x6, #16\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+ "addvl x6, x6, #-6\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "st1w { z31.s }, p0, [x22, x15, LSL #2]\n"
+ "mov z31.d, z13.d\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "ldr x22, [x5, #0x20]\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "st1w { z30.s }, p0, [x21, x15, LSL #2]\n"
+ "mov z30.d, z13.d\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "st1w { z29.s }, p0, [x20, x15, LSL #2]\n"
+ "mov z29.d, z13.d\n"
+ "ldr x21, [x5, #0x28]\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "ldr x20, [x5, #0x30]\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "st1w { z28.s }, p0, [x19, x15, LSL #2]\n"
+ "mov z28.d, z13.d\n"
+ "ldr x19, [x5, #0x38]\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "st1w { z27.s }, p0, [x22, x15, LSL #2]\n"
+ "mov z27.d, z13.d\n"
+ "ldr x22, [x5, #0x40]\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x15, LSL #2]\n"
+ "mov z26.d, z13.d\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "ldr x21, [x5, #0x48]\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "st1w { z25.s }, p0, [x20, x15, LSL #2]\n"
+ "mov z25.d, z13.d\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "ldr x20, [x5, #0x50]\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z24.s }, p0, [x19, x15, LSL #2]\n"
+ "mov z24.d, z13.d\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "ldr x19, [x5, #0x58]\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "st1w { z23.s }, p0, [x22, x15, LSL #2]\n"
+ "mov z23.d, z13.d\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "ldr x22, [x5, #0x60]\n"
+ "fmin z22.s, p3/M, z22.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x15, LSL #2]\n"
+ "mov z22.d, z13.d\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "ldr x21, [x5, #0x68]\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "st1w { z21.s }, p0, [x20, x15, LSL #2]\n"
+ "mov z21.d, z13.d\n"
+ "fmax z19.s, p3/M, z19.s, z15.s\n"
+ "ldr x20, [x5, #0x70]\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ "st1w { z20.s }, p0, [x19, x15, LSL #2]\n"
+ "mov z20.d, z13.d\n"
+ "fmin z19.s, p3/M, z19.s, z14.s\n"
+ "ldr x19, [x5, #0x78]\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "st1w { z19.s }, p0, [x22, x15, LSL #2]\n"
+ "mov z19.d, z13.d\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmin z18.s, p3/M, z18.s, z14.s\n"
+ "st1w { z18.s }, p0, [x21, x15, LSL #2]\n"
+ "mov z18.d, z13.d\n"
+ "fmin z17.s, p3/M, z17.s, z14.s\n"
+ "st1w { z17.s }, p0, [x20, x15, LSL #2]\n"
+ "mov z17.d, z13.d\n"
+ "fmax z16.s, p3/M, z16.s, z15.s\n"
+ "fmin z16.s, p3/M, z16.s, z14.s\n"
+ "st1w { z16.s }, p0, [x19, x15, LSL #2]\n"
+ "mov z16.d, z13.d\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x7, #0x20]\n"
+ "incw x15\n"
+ "fmla z30.s, p3/M, z7.s, z9.s\n"
+ "ldr x21, [x7, #0x28]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [x7, #0x30]\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x22, x8]\n"
+ "fmla z26.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x21, x8]\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x20, x8]\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x19, [x7, #0x38]\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "ldr x10, [x7, #0x40]\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "prfm pldl1keep, [x19, x8]\n"
+ "fmla z29.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla z26.s, p3/M, z5.s, z12.s\n"
+ "ldr x9, [x7, #0x48]\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ldr x28, [x7, #0x50]\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ldr x27, [x7, #0x58]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "prfm pldl1keep, [x28, x8]\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla z20.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x19, x17, LSL #2]\n"
+ "fmla z19.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "ldr x26, [x7, #0x60]\n"
+ "fmla z26.s, p3/M, z7.s, z9.s\n"
+ "ldr x25, [x7, #0x68]\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "ldr x24, [x7, #0x70]\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "prfm pldl1keep, [x25, x8]\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x7, #0x78]\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "ldr x14, [x7, #0x80]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x23, x8]\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x14, x8]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "ldr x13, [x7, #0x88]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "ldr x12, [x7, #0x90]\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ldr x11, [x7, #0x98]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x13, x8]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x12, x8]\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "prfm pldl1keep, [x11, x8]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "ldr x22, [x7, #0xa0]\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "ldr x21, [x7, #0xa8]\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x22, x8]\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x21, x8]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "ldr x20, [x7, #0xb0]\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "ldr x19, [x7, #0xb8]\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "prfm pldl1keep, [x20, x8]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x19, x8]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "ldr x10, [x7, #0xc0]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "ldr x9, [x7, #0xc8]\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "ldr x28, [x7, #0xd0]\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z11.s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "prfm pldl1keep, [x28, x8]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ldr x27, [x7, #0xd8]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "ldr x26, [x7, #0xe0]\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ldr x25, [x7, #0xe8]\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "prfm pldl1keep, [x25, x8]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x24, [x7, #0xf0]\n"
+ "fmla z26.s, p3/M, z3.s, z10.s\n"
+ "ldr x23, [x7, #0xf8]\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ldr x14, [x7, #0x100]\n"
+ "fmla z22.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "prfm pldl1keep, [x23, x8]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x14, x8]\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ldr x13, [x7, #0x108]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "ldr x12, [x7, #0x110]\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "ldr x11, [x7, #0x118]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "prfm pldl1keep, [x13, x8]\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "prfm pldl1keep, [x12, x8]\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x19, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "prfm pldl1keep, [x11, x8]\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "ldr x22, [x5, #0x0]\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "ldr x21, [x5, #0x8]\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "ldr x20, [x5, #0x10]\n"
+ "fmla z19.s, p3/M, z1.s, z11.s\n"
+ "ldr x19, [x5, #0x18]\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z20.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z16.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z19.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z7.s, z11.s\n"
+ "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z16.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "st1w { z31.s }, p0, [x22, x15, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "ldr x22, [x5, #0x20]\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "st1w { z30.s }, p0, [x21, x15, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "st1w { z29.s }, p0, [x20, x15, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "st1w { z28.s }, p0, [x19, x15, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "ldr x21, [x5, #0x28]\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "ldr x20, [x5, #0x30]\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "ldr x19, [x5, #0x38]\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "st1w { z27.s }, p0, [x22, x15, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x15, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "ldr x22, [x5, #0x40]\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "ldr x21, [x5, #0x48]\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "st1w { z25.s }, p0, [x20, x15, LSL #2]\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "st1w { z24.s }, p0, [x19, x15, LSL #2]\n"
+ "fmax z19.s, p3/M, z19.s, z15.s\n"
+ "st1w { z23.s }, p0, [x22, x15, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z14.s\n"
+ "ldr x20, [x5, #0x50]\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "ldr x19, [x5, #0x58]\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ "ldr x22, [x5, #0x60]\n"
+ "fmin z19.s, p3/M, z19.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x15, LSL #2]\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "st1w { z21.s }, p0, [x20, x15, LSL #2]\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "st1w { z20.s }, p0, [x19, x15, LSL #2]\n"
+ "fmax z16.s, p3/M, z16.s, z15.s\n"
+ "st1w { z19.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x21, [x5, #0x68]\n"
+ "fmin z18.s, p3/M, z18.s, z14.s\n"
+ "ldr x20, [x5, #0x70]\n"
+ "fmin z17.s, p3/M, z17.s, z14.s\n"
+ "ldr x19, [x5, #0x78]\n"
+ "fmin z16.s, p3/M, z16.s, z14.s\n"
+ "st1w { z18.s }, p0, [x21, x15, LSL #2]\n"
+ "st1w { z17.s }, p0, [x20, x15, LSL #2]\n"
+ "st1w { z16.s }, p0, [x19, x15, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
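Note on the store tail above: before each st1w, every accumulator is clamped with an fmax against the broadcast activation minimum (z15) followed by an fmin against the maximum (z14). A minimal scalar sketch of that per-lane clamp, assuming only the activation_min/activation_max arguments of the kernel signature:

    #include <algorithm>

    // Scalar equivalent of the fmax/fmin pair applied to each vector lane
    // before the result is stored back.
    static inline float apply_activation(float acc, float activation_min, float activation_max)
    {
        return std::min(std::max(acc, activation_min), activation_max);
    }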
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f5b6a4f8ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
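The geometry constants in this descriptor are tied together by the usual receptive-field identity, which is where input_rows/input_cols of 5 come from for a 2x2 output tile at stride 2 under a 3x3 kernel. An illustrative, self-contained check (values copied from the struct above):

    // Per spatial dimension: input = (output - 1) * stride + kernel.
    constexpr unsigned int kernel_rows = 3, stride_rows = 2, output_rows = 2;
    constexpr unsigned int input_rows  = (output_rows - 1) * stride_rows + kernel_rows;
    static_assert(input_rows == 5, "matches input_rows in the descriptor above");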
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..ad53872630
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x5, #0x0\n"
+ "mov x6, #0x0\n"
+ "1:" // Tile loop
+ "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x21, #0x4\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "cntb x7\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x7, x7, XZR, LSL #4\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cntb x17\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cntb x15\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x5, x20\n" // offset = tile_i * ld_input_row
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x6, x16, x19\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x14, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x11, x14, x20, LSL #2\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x10, x11, x20, LSL #2\n"
+ "ld1w { z17.s }, p3/Z, [x8]\n"
+ "mov z31.d, z17.d\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "add x9, x10, x20, LSL #2\n"
+ "mov z30.d, z17.d\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "add x28, x9, x20, LSL #2\n"
+ "mov z29.d, z17.d\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "add x27, x16, x16\n"
+ "mov z28.d, z17.d\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "add x26, x27, x16\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "add x25, x26, x16\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "add x17, x17, x16, LSL #4\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "add x15, x15, x27, LSL #4\n"
+ "cntb x24\n"
+ "prfm pldl1keep, [x10, x15]\n"
+ "prfm pldl1keep, [x14, x7]\n"
+ "add x24, x24, x26, LSL #4\n"
+ "prfm pldl1keep, [x14, x17]\n"
+ "cntb x23\n"
+ "prfm pldl1keep, [x14, x24]\n"
+ "add x23, x23, x25, LSL #4\n"
+ "mov x20, #0x2\n"
+ "prfm pldl1keep, [x14, x23]\n"
+ "prfm pldl1keep, [x11, x7]\n"
+ "mul x19, x5, x22\n" // offset = tile_i * ld_output_row
+ "prfm pldl1keep, [x11, x17]\n"
+ "madd x19, x6, x13, x19\n" // offset += tile_j * ld_output_col
+ "prfm pldl1keep, [x14, x15]\n"
+ "mul x19, x19, x20\n" // offset *= output_tile_size
+ "mov x21, #0x0\n"
+ "add x12, x12, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x22, x12, x22, LSL #2\n"
+ "cntw x20\n"
+ "sub x19, XZR, x20\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
+ "addvl x8, x8, #16\n"
+ "ld1w { z11.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "cmp x20, %x[n_channels]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x14, x25, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x11]\n"
+ "ld1w { z15.s }, p2/Z, [x11, x16, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "prfm pldl1keep, [x11, x24]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "prfm pldl1keep, [x11, x23]\n"
+ "incw x19\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "prfm pldl1keep, [x11, x15]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x9, x7]\n"
+ "incw x21\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x10, x7]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x25, LSL #2]\n"
+ "incw x20\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "prfm pldl1keep, [x9, x17]\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x9]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "prfm pldl1keep, [x10, x17]\n"
+ "prfm pldl1keep, [x9, x24]\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "prfm pldl1keep, [x10, x24]\n"
+ "ld1w { z14.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x10]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x16, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z15.s\n"
+ "prfm pldl1keep, [x9, x23]\n"
+ "prfm pldl1keep, [x28, x7]\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "prfm pldl1keep, [x10, x23]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "prfm pldl1keep, [x28, x17]\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "prfm pldl1keep, [x9, x15]\n"
+ "prfm pldl1keep, [x28, x24]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z15.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "prfm pldl1keep, [x28, x15]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x28, x23]\n"
+ "fmla z29.s, p3/M, z6.s, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z9.s }, p1/Z, [x10, x27, LSL #2]\n"
+ "prfm pldl1keep, [x10, x15]\n"
+ "fmax z31.s, p3/M, z31.s, z19.s\n"
+ "ld1w { z10.s }, p1/Z, [x14]\n"
+ "fmla z28.s, p3/M, z5.s, z14.s\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z14.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "fmax z30.s, p3/M, z30.s, z19.s\n"
+ "prfm pldl1keep, [x14, x7]\n"
+ "prfm pldl1keep, [x14, x17]\n"
+ "fmin z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmin z30.s, p3/M, z30.s, z18.s\n"
+ "prfm pldl1keep, [x14, x24]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z13.s }, p1/Z, [x14, x25, LSL #2]\n"
+ "cmp x20, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z8.s, z15.s\n"
+ "prfm pldl1keep, [x14, x23]\n"
+ "prfm pldl1keep, [x11, x7]\n"
+ "fmla z28.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z14.s }, p1/Z, [x11]\n"
+ "prfm pldl1keep, [x11, x17]\n"
+ "fmax z29.s, p3/M, z29.s, z19.s\n"
+ "ld1w { z16.s }, p1/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z15.s\n"
+ "ld1w { z15.s }, p1/Z, [x11, x16, LSL #2]\n"
+ "prfm pldl1keep, [x14, x15]\n"
+ "fmin z29.s, p3/M, z29.s, z18.s\n"
+ "st1w { z31.s }, p0, [x12]\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x14, x16, LSL #2]\n"
+ "st1w { z30.s }, p0, [x12, x13, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "addvl x12, x12, #1\n"
+ "fmin z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z17.s }, p3/Z, [x8]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "mov z31.d, z17.d\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "mov z30.d, z17.d\n"
+ "st1w { z28.s }, p0, [x22, x13, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "mov z29.d, z17.d\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "mov z28.d, z17.d\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "prfm pldl1keep, [x11, x24]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "prfm pldl1keep, [x11, x23]\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "prfm pldl1keep, [x11, x15]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x9, x7]\n"
+ "prfm pldl1keep, [x10, x7]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x9, x17]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x25, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "prfm pldl1keep, [x10, x17]\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x9]\n"
+ "prfm pldl1keep, [x9, x24]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "prfm pldl1keep, [x10, x24]\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "prfm pldl1keep, [x9, x23]\n"
+ "fmla z31.s, p3/M, z4.s, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x10]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x16, LSL #2]\n"
+ "fmla z29.s, p3/M, z0.s, z15.s\n"
+ "prfm pldl1keep, [x28, x7]\n"
+ "prfm pldl1keep, [x10, x23]\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "prfm pldl1keep, [x28, x17]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "prfm pldl1keep, [x9, x15]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "prfm pldl1keep, [x28, x24]\n"
+ "prfm pldl1keep, [x28, x15]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z15.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "prfm pldl1keep, [x28, x23]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x5, #0x1\n"
+ "fmla z28.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x6, x6, #0x1\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmax z31.s, p3/M, z31.s, z19.s\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x6, x19\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "csel x6, x6, XZR, LT\n"
+ "fmin z31.s, p3/M, z31.s, z18.s\n"
+ "st1w { z31.s }, p0, [x12]\n"
+ "fmla z28.s, p3/M, z7.s, z14.s\n"
+ "csel x5, x5, x21, LT\n"
+ "fmla z29.s, p3/M, z8.s, z15.s\n"
+ "cmp x5, x20\n"
+ "fmax z30.s, p3/M, z30.s, z19.s\n"
+ "fmla z28.s, p3/M, z6.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z18.s\n"
+ "st1w { z30.s }, p0, [x12, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z19.s\n"
+ "fmin z29.s, p3/M, z29.s, z18.s\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "fmax z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z28.s, p3/M, z28.s, z18.s\n"
+ "st1w { z28.s }, p0, [x22, x13, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
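The commented mul/madd sequence at the top of the tile loop computes the input pointer for tile (tile_i, tile_j). A hedged C++ reconstruction of that arithmetic (a sketch, not code from this patch; strides are in elements, and the LSL #2 in the assembly supplies the sizeof(float) scaling):

    #include <cstdint>

    // offset = (tile_i * ld_input_row + tile_j * ld_input_col) * output_tile * stride;
    // the factor of 4 is the "mov x21, #0x4" above (2x2 output tile, stride 2).
    const float *tile_inptr(const float *inptr, int64_t ld_input_row, int64_t ld_input_col,
                            uint64_t tile_i, uint64_t tile_j)
    {
        const uint64_t step = 2 * 2;  // output_tile * stride
        return inptr + step * (tile_i * ld_input_row + tile_j * ld_input_col);
    }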
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..06b3575d4b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntb x12, ALL, MUL #2\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x19, #0x0]\n"
+ "cntw x28\n"
+ "ldp x27, x26, [x19, #0x10]\n"
+ "sub x25, XZR, x28\n"
+ "ld1w { z17.s }, p3/Z, [x14]\n"
+ "mov z31.d, z17.d\n"
+ "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mov z30.d, z17.d\n"
+ "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
+ "cmp x28, %x[n_channels]\n"
+ "mov z29.d, z17.d\n"
+ "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
+ "mov z28.d, z17.d\n"
+ "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
+ "addvl x14, x14, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "ld1w { z10.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "ldp x24, x23, [x13, #0x10]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldp x20, x19, [x13, #0x30]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x11, LSL #2]\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "ld1w { z12.s }, p2/Z, [x23, x11, LSL #2]\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "ld1w { z14.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "ld1w { z15.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x13, #0x40]\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x13, #0x48]\n"
+ "incw x25\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x13, #0x50]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr x19, [x13, #0x70]\n"
+ "fmla z31.s, p3/M, z4.s, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmla z29.s, p3/M, z0.s, z15.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ldr x19, [x13, #0x78]\n"
+ "ldr x21, [x13, #0x80]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "ldr x24, [x13, #0x90]\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z13.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x24, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "ldr x23, [x13, #0x98]\n"
+ "fmla z29.s, p3/M, z6.s, z15.s\n"
+ "ldr x22, [x13, #0xa0]\n"
+ "fmax z31.s, p3/M, z31.s, z19.s\n"
+ "ldr x21, [x13, #0xa8]\n"
+ "fmla z28.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x11, LSL #2]\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "fmin z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "fmax z30.s, p3/M, z30.s, z19.s\n"
+ "ldr x19, [x13, #0xb8]\n"
+ "ldr x22, [x13, #0xc0]\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmin z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z15.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z28.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z15.s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "incw x11\n"
+ "fmla z28.s, p3/M, z6.s, z15.s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "whilelt p2.s, x11, %x[n_channels]\n"
+ "fmax z29.s, p3/M, z29.s, z19.s\n"
+ "ldp x24, x23, [x13, #0x10]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z9.s }, p1/Z, [x21, x28, LSL #2]\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmin z29.s, p3/M, z29.s, z18.s\n"
+ "ld1w { z10.s }, p1/Z, [x20, x28, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmax z28.s, p3/M, z28.s, z19.s\n"
+ "ld1w { z11.s }, p1/Z, [x24, x28, LSL #2]\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "fmin z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z12.s }, p1/Z, [x23, x28, LSL #2]\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldp x20, x19, [x13, #0x30]\n"
+ "st1w { z31.s }, p0, [x10, x25, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x22, x28, LSL #2]\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "ld1w { z14.s }, p1/Z, [x21, x28, LSL #2]\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "ld1w { z15.s }, p1/Z, [x20, x28, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "ld1w { z16.s }, p1/Z, [x19, x28, LSL #2]\n"
+ "incw x28\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "cmp x28, %x[n_channels]\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "st1w { z29.s }, p0, [x27, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x25, LSL #2]\n"
+ "ld1w { z17.s }, p3/Z, [x14]\n"
+ "mov z31.d, z17.d\n"
+ "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
+ "mov z30.d, z17.d\n"
+ "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
+ "mov z29.d, z17.d\n"
+ "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
+ "mov z28.d, z17.d\n"
+ "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
+ "addvl x14, x14, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x22, [x13, #0x40]\n"
+ "incw x25\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x13, #0x48]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr x19, [x13, #0x70]\n"
+ "fmla z31.s, p3/M, z4.s, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmla z29.s, p3/M, z0.s, z15.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ldr x19, [x13, #0x78]\n"
+ "ldr x21, [x13, #0x80]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "ldr x24, [x13, #0x90]\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z13.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z31.s, p3/M, z6.s, z15.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x24, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "ldr x23, [x13, #0x98]\n"
+ "fmla z29.s, p3/M, z6.s, z15.s\n"
+ "ldr x22, [x13, #0xa0]\n"
+ "fmax z31.s, p3/M, z31.s, z19.s\n"
+ "ldr x21, [x13, #0xa8]\n"
+ "fmla z28.s, p3/M, z5.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x11, LSL #2]\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "fmin z31.s, p3/M, z31.s, z18.s\n"
+ "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x21, x12]\n"
+ "fmla z29.s, p3/M, z7.s, z13.s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "fmax z30.s, p3/M, z30.s, z19.s\n"
+ "ldr x19, [x13, #0xb8]\n"
+ "ldr x22, [x13, #0xc0]\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmin z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z15.s }, p2/Z, [x19, x11, LSL #2]\n"
+ "prfm pldl1keep, [x19, x12]\n"
+ "fmla z28.s, p3/M, z7.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z15.s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "st1w { z31.s }, p0, [x10, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z15.s\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmin z29.s, p3/M, z29.s, z18.s\n"
+ "st1w { z29.s }, p0, [x27, x25, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z28.s, p3/M, z28.s, z18.s\n"
+ "st1w { z28.s }, p0, [x26, x25, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
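The whilelt/incw pattern in both loops above is the standard vector-length-agnostic SVE idiom: process svcntw() floats per pass and let the predicate mask the ragged tail. A minimal sketch of one multiply-accumulate tap in ACLE intrinsics (free-standing assumed names, not this kernel's internals):

    #include <arm_sve.h>
    #include <cstdint>

    // One fmla tap over a channel vector, predicated like the assembly's
    // "whilelt p2.s, x11, %x[n_channels]" loop.
    void mla_one_tap(const float *in, const float *weights, const float *bias,
                     float *out, unsigned int n_channels)
    {
        for (uint64_t c = 0; c < n_channels; c += svcntw())
        {
            svbool_t pg = svwhilelt_b32(c, (uint64_t) n_channels);
            svfloat32_t acc = svld1_f32(pg, bias + c);     // accumulators start at the bias
            svfloat32_t x   = svld1_f32(pg, in + c);       // one input point
            svfloat32_t w   = svld1_f32(pg, weights + c);  // its weights
            acc = svmla_f32_m(pg, acc, x, w);              // acc += x * w
            svst1_f32(pg, out + c, acc);
        }
    }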
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..d49f7fdceb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
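Unlike the direct form, the _indirect_impl declared above takes one pointer per input point; by analogy with the inptrs[25] table in the 3x3/s2 kernel earlier in this patch, the 6x6 receptive field here implies a 36-entry table. A hedged sketch of how a caller might populate such a row-major table from a dense NHWC tensor (helper name and layout assumed):

    #include <cstdint>

    // Assumed helper: one pointer per point of the input_rows x input_cols
    // receptive field, row-major; strides are in elements.
    void fill_input_ptrs(const float **input_ptrs, const float *in,
                         int64_t ld_row, int64_t ld_col,
                         unsigned int input_rows, unsigned int input_cols)
    {
        for (unsigned int i = 0; i < input_rows; i++)
            for (unsigned int j = 0; j < input_cols; j++)
                input_ptrs[i * input_cols + j] = in + i * ld_row + j * ld_col;
    }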
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..f751186dce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
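+ // Args mirrors the layout the inline assembly expects; its field offsets are
+ // handed to the asm block as immediate operands via offsetof() below.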
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
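+ // Layout of the assembly: label 1 loops over output tiles, label 2 is the
+ // SVE channel loop (whilelt-predicated, so partial vectors are handled
+ // safely) and label 3 is the channel tail. z31..z28 accumulate the 2x2
+ // output tile starting from the bias in z16; z18/z17 hold the activation
+ // min/max applied by the fmax/fmin pairs before each store.
+ //
+ // Reference semantics per channel c and tile output (i, j), as a scalar
+ // sketch for orientation only:
+ //   out[i][j][c] = clamp(bias[c]
+ //                        + sum_{ki<5, kj<5} in[i+ki][j+kj][c] * w[ki][kj][c],
+ //                        min, max)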
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x5, #0x0\n"
+ "mov x6, #0x0\n"
+ "1:" // Tile loop
+ "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x20, #0x2\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x7, #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x17, #0x0\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "cntw x16\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "sub x14, XZR, x16\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x13, x13, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x20, x13, x22, LSL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x10, x20, x22, LSL #2\n"
+ "ld1w { z16.s }, p3/Z, [x8]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "add x9, x10, x22, LSL #2\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "add x28, x9, x22, LSL #2\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "add x27, x28, x22, LSL #2\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "add x26, x15, x15\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "add x25, x26, x15\n"
+ "mul x19, x5, x21\n" // offset = tile_i * ld_output_row
+ "add x24, x25, x15\n"
+ "add x23, x24, x15\n"
+ "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x7\n" // offset *= output_tile_size
+ "add x11, x11, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x22, x11, x21, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z5.s }, p2/Z, [x13]\n"
+ "ld1w { z6.s }, p2/Z, [x13, x15, LSL #2]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z7.s }, p2/Z, [x20]\n"
+ "addvl x8, x8, #6\n"
+ "ld1w { z8.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x13, x26, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x26, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x25, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x13, x24, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x20, x23, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x10]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "incw x14\n"
+ "fmla z29.s, p3/M, z0.s, z7.s\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z0.s }, p3/Z, [x8]\n"
+ "incw x17\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n"
+ "addvl x20, x20, #1\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "incw x16\n"
+ "fmla z29.s, p3/M, z1.s, z8.s\n"
+ "fmla z28.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z28.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z5.s\n"
+ "fmla z28.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z6.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z7.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "fmla z30.s, p3/M, z0.s, z8.s\n"
+ "fmla z29.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z8.s\n"
+ "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z30.s, p3/M, z2.s, z5.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "fmla z31.s, p3/M, z3.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmla z30.s, p3/M, z3.s, z6.s\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "fmla z28.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z5.s\n"
+ "fmla z28.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z6.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "fmla z28.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z1.s }, p3/Z, [x8]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z5.s\n"
+ "fmla z28.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "fmla z28.s, p3/M, z3.s, z8.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z14.s }, p1/Z, [x10]\n"
+ "fmla z29.s, p3/M, z4.s, z8.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z13.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z5.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x13]\n"
+ "fmla z30.s, p3/M, z2.s, z6.s\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n"
+ "whilelt p2.s, x17, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z31.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z6.s }, p1/Z, [x13, x15, LSL #2]\n"
+ "addvl x8, x8, #16\n"
+ "fmla z30.s, p3/M, z3.s, z8.s\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x13, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z8.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z10.s }, p1/Z, [x20, x23, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x13, x24, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z9.s }, p1/Z, [x13, x26, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "addvl x8, x8, #-6\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x11]\n"
+ "mov z31.d, z16.d\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "st1w { z30.s }, p0, [x11, x12, LSL #2]\n"
+ "mov z30.d, z16.d\n"
+ "addvl x11, x11, #1\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "mov z29.d, z16.d\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z28.s }, p0, [x22, x12, LSL #2]\n"
+ "mov z28.d, z16.d\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x5, #0x1\n"
+ "fmla z29.s, p3/M, z0.s, z7.s\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "fmla z28.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z0.s }, p3/Z, [x8]\n"
+ "add x6, x6, #0x1\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z29.s, p3/M, z1.s, z8.s\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x6, x19\n"
+ "fmla z28.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n"
+ "csel x6, x6, XZR, LT\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "csel x5, x5, x21, LT\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "cmp x5, x20\n"
+ "fmla z28.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z5.s\n"
+ "fmla z28.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z6.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z7.s\n"
+ "fmla z30.s, p3/M, z0.s, z8.s\n"
+ "fmla z29.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z8.s\n"
+ "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z5.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "fmla z31.s, p3/M, z3.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p3/M, z3.s, z6.s\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "fmla z28.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z5.s\n"
+ "fmla z28.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z6.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "fmla z28.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z1.s }, p3/Z, [x8]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z5.s\n"
+ "fmla z28.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "fmla z28.s, p3/M, z3.s, z8.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z14.s\n"
+ "fmla z29.s, p3/M, z4.s, z8.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z30.s, p3/M, z1.s, z5.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z5.s\n"
+ "fmla z30.s, p3/M, z2.s, z6.s\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z6.s\n"
+ "fmla z30.s, p3/M, z3.s, z8.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z8.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z9.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x11]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x11, x12, LSL #2]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "st1w { z28.s }, p0, [x22, x12, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..6e35ee86c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
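+ // The indirect variant takes one pointer per element of the 6x6 input
+ // window (36 in all). The constructor below reorders the leading entries
+ // into the sequence in which the assembly first consumes them.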
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
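+ // Same multiply-accumulate pattern as the direct kernel, but each input row
+ // is fetched through the pointer table at offsetof(Args, inptrs), and prfm
+ // pldl1keep instructions prefetch the next channel block ahead of use.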
+ __asm__ __volatile__(
+ "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ptrue p3.b\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntb x14, ALL, MUL #2\n"
+ "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x19, #0x0]\n"
+ "cntw x10\n"
+ "ldp x9, x28, [x19, #0x10]\n"
+ "sub x27, XZR, x10\n"
+ "ld1w { z16.s }, p3/Z, [x16]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "cmp x10, %x[n_channels]\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "addvl x16, x16, #6\n"
+ "ldp x26, x25, [x15, #0x0]\n"
+ "ldp x24, x23, [x15, #0x10]\n"
+ "ldp x20, x19, [x15, #0x20]\n"
+ "ld1w { z5.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "ld1w { z6.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "ld1w { z7.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "ld1w { z8.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "ldp x22, x21, [x15, #0x30]\n"
+ "ldp x20, x19, [x15, #0x40]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "ld1w { z14.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ldr x21, [x15, #0x50]\n"
+ "whilelt p1.s, x10, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "ldr x19, [x15, #0x58]\n"
+ "incw x27\n"
+ "fmla z29.s, p3/M, z0.s, z7.s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z1.s, z8.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z28.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x16]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ldr x19, [x15, #0x68]\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z1.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z5.s\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z5.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z28.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z9.s\n"
+ "ldr x19, [x15, #0x78]\n"
+ "fmla z29.s, p3/M, z4.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ldr x26, [x15, #0x80]\n"
+ "fmla z31.s, p3/M, z0.s, z7.s\n"
+ "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z8.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z0.s, z14.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z4.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z8.s\n"
+ "ldr x25, [x15, #0x88]\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "ldr x24, [x15, #0x90]\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z5.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x15, #0x98]\n"
+ "fmla z31.s, p3/M, z3.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla z31.s, p3/M, z4.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "addvl x16, x16, #16\n"
+ "fmla z28.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z31.s, p3/M, z0.s, z14.s\n"
+ "ldr x19, [x15, #0xa8]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmla z29.s, p3/M, z0.s, z5.s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "fmla z28.s, p3/M, z0.s, z6.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z29.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #-6, MUL VL]\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "ldr x19, [x15, #0xc8]\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x16, #-5, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "ldr x19, [x15, #0xd8]\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #-3, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z5.s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x16, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ldr x19, [x15, #0xe8]\n"
+ "fmla z29.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z8.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z6.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #-1, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z8.s\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ldr x19, [x15, #0xf8]\n"
+ "fmla z30.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z1.s }, p3/Z, [x16]\n"
+ "fmla z29.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ldr x26, [x15, #0x100]\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "ldr x25, [x15, #0x108]\n"
+ "fmla z30.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z5.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z30.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x24, [x15, #0x110]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x23, [x15, #0x118]\n"
+ "fmla z31.s, p3/M, z3.s, z6.s\n"
+ "fmla z30.s, p3/M, z3.s, z8.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "incw x13\n"
+ "fmla z31.s, p3/M, z4.s, z8.s\n"
+ "ldp x26, x25, [x15, #0x0]\n"
+ "whilelt p2.s, x13, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ldp x24, x23, [x15, #0x10]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ldp x20, x19, [x15, #0x20]\n"
+ "ldp x22, x21, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z5.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "ld1w { z6.s }, p1/Z, [x25, x10, LSL #2]\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "ld1w { z7.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "ld1w { z8.s }, p1/Z, [x23, x10, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "ld1w { z13.s }, p1/Z, [x19, x10, LSL #2]\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "ld1w { z12.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "ldp x20, x19, [x15, #0x40]\n"
+ "st1w { z31.s }, p0, [x12, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "ld1w { z14.s }, p1/Z, [x19, x10, LSL #2]\n"
+ "incw x10\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "cmp x10, %x[n_channels]\n"
+ "st1w { z29.s }, p0, [x9, x27, LSL #2]\n"
+ "st1w { z28.s }, p0, [x28, x27, LSL #2]\n"
+ "ld1w { z16.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "mov z31.d, z16.d\n"
+ "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "mov z30.d, z16.d\n"
+ "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "mov z29.d, z16.d\n"
+ "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "mov z28.d, z16.d\n"
+ "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ldr x21, [x15, #0x50]\n"
+ "incw x27\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "ldr x19, [x15, #0x58]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z29.s, p3/M, z0.s, z7.s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z1.s, z8.s\n"
+ "fmla z28.s, p3/M, z1.s, z13.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "ld1w { z0.s }, p3/Z, [x16]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ldr x19, [x15, #0x68]\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z28.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z1.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z5.s\n"
+ "fmla z28.s, p3/M, z3.s, z6.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z9.s\n"
+ "ldr x19, [x15, #0x78]\n"
+ "fmla z29.s, p3/M, z4.s, z6.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ldr x26, [x15, #0x80]\n"
+ "fmla z31.s, p3/M, z0.s, z7.s\n"
+ "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z8.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z8.s\n"
+ "ldr x25, [x15, #0x88]\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z5.s\n"
+ "ldr x24, [x15, #0x90]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x15, #0x98]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "fmla z31.s, p3/M, z3.s, z5.s\n"
+ "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z6.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla z31.s, p3/M, z4.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "addvl x16, x16, #16\n"
+ "fmla z28.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z31.s, p3/M, z0.s, z14.s\n"
+ "ldr x19, [x15, #0xa8]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmla z29.s, p3/M, z0.s, z5.s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "fmla z28.s, p3/M, z0.s, z6.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla z29.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #-6, MUL VL]\n"
+ "fmla z29.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z30.s, p3/M, z3.s, z13.s\n"
+ "ldr x19, [x15, #0xc8]\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x16, #-5, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z13.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla z31.s, p3/M, z0.s, z5.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z0.s, z6.s\n"
+ "ldr x19, [x15, #0xd8]\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla z31.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z10.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #-3, MUL VL]\n"
+ "fmla z28.s, p3/M, z1.s, z5.s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z4.s }, p3/Z, [x16, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ldr x19, [x15, #0xe8]\n"
+ "fmla z29.s, p3/M, z2.s, z5.s\n"
+ "ld1w { z8.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z6.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #-1, MUL VL]\n"
+ "fmla z28.s, p3/M, z3.s, z8.s\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "ldr x19, [x15, #0xf8]\n"
+ "fmla z30.s, p3/M, z4.s, z14.s\n"
+ "ld1w { z1.s }, p3/Z, [x16]\n"
+ "fmla z29.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "prfm pldl1keep, [x19, x14]\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ldr x26, [x15, #0x100]\n"
+ "fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "ldr x25, [x15, #0x108]\n"
+ "fmla z30.s, p3/M, z1.s, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "prfm pldl1keep, [x26, x14]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z5.s\n"
+ "prfm pldl1keep, [x25, x14]\n"
+ "fmla z30.s, p3/M, z2.s, z6.s\n"
+ "ld1w { z3.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x24, [x15, #0x110]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x23, [x15, #0x118]\n"
+ "fmla z31.s, p3/M, z3.s, z6.s\n"
+ "fmla z30.s, p3/M, z3.s, z8.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "prfm pldl1keep, [x24, x14]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla z31.s, p3/M, z4.s, z8.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z9.s\n"
+ "fmax z31.s, p3/M, z31.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z18.s\n"
+ "fmin z31.s, p3/M, z31.s, z17.s\n"
+ "st1w { z31.s }, p0, [x12, x27, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z17.s\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "fmax z28.s, p3/M, z28.s, z18.s\n"
+ "st1w { z29.s }, p0, [x9, x27, LSL #2]\n"
+ "fmin z28.s, p3/M, z28.s, z17.s\n"
+ "st1w { z28.s }, p0, [x28, x27, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dd2c519e3a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
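+// Generic fp32 NHWC depthwise kernel for arbitrary kernel sizes: instead of
+// a fixed tile it iterates over the kernel points at run time, producing up
+// to nine output points per call as advertised by n_output_points below.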
+struct sve_fp32_nhwc_generic_output9_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+ sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..370218e1d4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
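+ // Each pass of the channel loop covers one SVE vector of channels for all
+ // nine output points: every one of the n_points kernel positions supplies a
+ // single weight vector (z0) that is multiply-accumulated into nine
+ // accumulators seeded from the (optional) bias, then clamped and stored.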
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "ld1rw { z4.s }, p1/Z, [%x[minmax_vals]]\n"
+ "mov x28, #0x0\n"
+ "ld1rw { z3.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "1:" // Channel loop
+ "mov z2.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z2.s }, p0/Z, [%x[bias], x28, LSL #2]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov z1.d, z2.d\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z31.d, z2.d\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "subs x21, %x[n_points], #0x1\n"
+ "mov z30.d, z2.d\n"
+ "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "mov z28.d, z2.d\n"
+ "addvl %x[params], %x[params], #1\n"
+ "mov z27.d, z2.d\n"
+ "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "mov z25.d, z2.d\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "mov z24.d, z2.d\n"
+ "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "mov z22.d, z2.d\n"
+ "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ldr x19, [x22], #0x8\n"
+ "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "fmla z2.s, p1/M, z29.s, z0.s\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "fmla z1.s, p1/M, z26.s, z0.s\n"
+ "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "fmla z31.s, p1/M, z23.s, z0.s\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "fmla z28.s, p1/M, z20.s, z0.s\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "fmla z27.s, p1/M, z19.s, z0.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "fmla z25.s, p1/M, z18.s, z0.s\n"
+ "fmla z24.s, p1/M, z17.s, z0.s\n"
+ "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "fmla z22.s, p1/M, z16.s, z0.s\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ldp x20, x19, [x22], #0x10\n"
+ "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "ldr x19, [x22], #0x8\n"
+ "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla z2.s, p1/M, z29.s, z0.s\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "fmla z1.s, p1/M, z26.s, z0.s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "fmla z31.s, p1/M, z23.s, z0.s\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "fmla z28.s, p1/M, z20.s, z0.s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmla z27.s, p1/M, z19.s, z0.s\n"
+ "fmla z25.s, p1/M, z18.s, z0.s\n"
+ "fmla z24.s, p1/M, z17.s, z0.s\n"
+ "fmla z22.s, p1/M, z16.s, z0.s\n"
+ "fmax z2.s, p1/M, z2.s, z4.s\n"
+ "fmax z1.s, p1/M, z1.s, z4.s\n"
+ "fmax z31.s, p1/M, z31.s, z4.s\n"
+ "fmax z30.s, p1/M, z30.s, z4.s\n"
+ "fmin z2.s, p1/M, z2.s, z3.s\n"
+ "st1w { z2.s }, p0, [x27, x28, LSL #2]\n"
+ "fmin z1.s, p1/M, z1.s, z3.s\n"
+ "fmin z31.s, p1/M, z31.s, z3.s\n"
+ "st1w { z1.s }, p0, [x26, x28, LSL #2]\n"
+ "fmin z30.s, p1/M, z30.s, z3.s\n"
+ "fmax z28.s, p1/M, z28.s, z4.s\n"
+ "st1w { z31.s }, p0, [x25, x28, LSL #2]\n"
+ "fmax z27.s, p1/M, z27.s, z4.s\n"
+ "st1w { z30.s }, p0, [x24, x28, LSL #2]\n"
+ "fmin z28.s, p1/M, z28.s, z3.s\n"
+ "fmax z25.s, p1/M, z25.s, z4.s\n"
+ "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+ "fmin z27.s, p1/M, z27.s, z3.s\n"
+ "fmin z25.s, p1/M, z25.s, z3.s\n"
+ "st1w { z27.s }, p0, [x22, x28, LSL #2]\n"
+ "fmax z24.s, p1/M, z24.s, z4.s\n"
+ "fmax z22.s, p1/M, z22.s, z4.s\n"
+ "st1w { z25.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z24.s, p1/M, z24.s, z3.s\n"
+ "st1w { z24.s }, p0, [x20, x28, LSL #2]\n"
+ "fmin z22.s, p1/M, z22.s, z3.s\n"
+ "st1w { z22.s }, p0, [x19, x28, LSL #2]\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5cf3314c65
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
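+ // A 3x3 output tile at stride 2 under a 3x3 kernel covers a 7x7 input patch:
+ // (3 - 1) * 2 + 3 = 7. Seven fp32 columns occupy two 128-bit quadwords,
+ // hence input_col_quads = 2.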
+ constexpr static unsigned int input_rows = 7;
+ constexpr static unsigned int input_cols = 7;
+ constexpr static unsigned int input_col_quads = 2;
+
+ kern_type kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+
+ sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ce640a207d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x20]\n"
+ "mov x25, #0x0\n"
+ "ldp x24, x23, [%x[outptrs], #0x30]\n"
+ "whilelt p1.s, x28, %x[channel_multiplier]\n"
+ "ldr x22, [%x[outptrs], #0x40]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "ld1rqw { z2.s }, p2/Z, [x21]\n"
+ "ld1rqw { z3.s }, p2/Z, [x21, #16]\n"
+ "ld1rqw { z4.s }, p2/Z, [x20]\n"
+ "ld1rqw { z5.s }, p2/Z, [x20, #16]\n"
+ "ld1rqw { z6.s }, p2/Z, [x19]\n"
+ "ld1rqw { z7.s }, p2/Z, [x19, #16]\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr x19, [%x[inptrs], #0x28]\n"
+ "ld1rqw { z8.s }, p2/Z, [x21]\n"
+ "ld1rqw { z9.s }, p2/Z, [x21, #16]\n"
+ "ld1rqw { z10.s }, p2/Z, [x20]\n"
+ "ld1rqw { z11.s }, p2/Z, [x20, #16]\n"
+ "ld1rqw { z12.s }, p2/Z, [x19]\n"
+ "ld1rqw { z13.s }, p2/Z, [x19, #16]\n"
+ "ldr x19, [%x[inptrs], #0x30]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[clamps]]\n"
+ "ld1rw { z25.s }, p2/Z, [%x[clamps], #4]\n"
+ "ld1rqw { z14.s }, p2/Z, [x19]\n"
+ "ld1rqw { z15.s }, p2/Z, [x19, #16]\n"
+ "ld1w { z24.s }, p1/Z, [%x[params]]\n"
+ "mov z23.d, z24.d\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z22.d, z24.d\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "mov z21.d, z24.d\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z20.d, z24.d\n"
+ "mov z19.d, z24.d\n"
+ "mov z18.d, z24.d\n"
+ "mov z17.d, z24.d\n"
+ "mov z16.d, z24.d\n"
+ "1:" // Output channel complete vector loop
+ "mov z0.d, z10.d\n"
+ "mov p0.b, p1.b\n"
+ "mov z1.d, z11.d\n"
+ "incw x28\n"
+ "fmla z24.s, z31.s, z2.s[0]\n"
+ "whilelt p1.s, x28, %x[channel_multiplier]\n"
+ "fmla z23.s, z31.s, z2.s[2]\n"
+ "fmla z22.s, z31.s, z3.s[0]\n"
+ "fmla z21.s, z31.s, z6.s[0]\n"
+ "fmla z20.s, z31.s, z6.s[2]\n"
+ "fmla z19.s, z31.s, z7.s[0]\n"
+ "fmla z18.s, z31.s, z0.s[0]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z16.s, z31.s, z1.s[0]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params]]\n"
+ "fmla z24.s, z30.s, z2.s[1]\n"
+ "fmla z23.s, z30.s, z2.s[3]\n"
+ "fmla z22.s, z30.s, z3.s[1]\n"
+ "fmla z21.s, z30.s, z6.s[1]\n"
+ "fmla z20.s, z30.s, z6.s[3]\n"
+ "fmla z19.s, z30.s, z7.s[1]\n"
+ "fmla z18.s, z30.s, z0.s[1]\n"
+ "fmla z17.s, z30.s, z0.s[3]\n"
+ "fmla z16.s, z30.s, z1.s[1]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z24.s, z29.s, z2.s[2]\n"
+ "fmla z23.s, z29.s, z3.s[0]\n"
+ "fmla z22.s, z29.s, z3.s[2]\n"
+ "fmla z21.s, z29.s, z6.s[2]\n"
+ "fmla z20.s, z29.s, z7.s[0]\n"
+ "fmla z19.s, z29.s, z7.s[2]\n"
+ "fmla z18.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[2]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z24.s, z31.s, z4.s[0]\n"
+ "fmla z23.s, z31.s, z4.s[2]\n"
+ "fmla z22.s, z31.s, z5.s[0]\n"
+ "fmla z21.s, z31.s, z0.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z19.s, z31.s, z1.s[0]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z18.s, z31.s, z0.s[0]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z16.s, z31.s, z1.s[0]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z24.s, z30.s, z4.s[1]\n"
+ "fmla z23.s, z30.s, z4.s[3]\n"
+ "fmla z22.s, z30.s, z5.s[1]\n"
+ "fmla z21.s, z30.s, z0.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z19.s, z30.s, z1.s[1]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z18.s, z30.s, z0.s[1]\n"
+ "fmla z17.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z16.s, z30.s, z1.s[1]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z24.s, z29.s, z4.s[2]\n"
+ "fmla z23.s, z29.s, z5.s[0]\n"
+ "fmla z22.s, z29.s, z5.s[2]\n"
+ "fmla z21.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z20.s, z29.s, z1.s[0]\n"
+ "fmla z19.s, z29.s, z1.s[2]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z18.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[2]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z24.s, z31.s, z6.s[0]\n"
+ "fmla z23.s, z31.s, z6.s[2]\n"
+ "fmla z22.s, z31.s, z7.s[0]\n"
+ "fmla z21.s, z31.s, z0.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z14.d\n"
+ "fmla z19.s, z31.s, z1.s[0]\n"
+ "mov z1.d, z15.d\n"
+ "fmla z18.s, z31.s, z0.s[0]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z16.s, z31.s, z1.s[0]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z24.s, z30.s, z6.s[1]\n"
+ "fmla z23.s, z30.s, z6.s[3]\n"
+ "fmla z22.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z0.s[1]\n"
+ "fmla z20.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z14.d\n"
+ "fmla z19.s, z30.s, z1.s[1]\n"
+ "mov z1.d, z15.d\n"
+ "fmla z18.s, z30.s, z0.s[1]\n"
+ "fmla z17.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z16.s, z30.s, z1.s[1]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z24.s, z29.s, z6.s[2]\n"
+ "fmla z23.s, z29.s, z7.s[0]\n"
+ "fmla z22.s, z29.s, z7.s[2]\n"
+ "fmla z21.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z14.d\n"
+ "fmla z20.s, z29.s, z1.s[0]\n"
+ "fmla z19.s, z29.s, z1.s[2]\n"
+ "mov z1.d, z15.d\n"
+ "fmla z18.s, z29.s, z0.s[2]\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[2]\n"
+ "fmin z24.s, p2/M, z24.s, z25.s\n"
+ "fmin z23.s, p2/M, z23.s, z25.s\n"
+ "fmin z22.s, p2/M, z22.s, z25.s\n"
+ "fmin z21.s, p2/M, z21.s, z25.s\n"
+ "fmax z24.s, p2/M, z24.s, z26.s\n"
+ "st1w { z24.s }, p0, [x12, x25, LSL #2]\n"
+ "fmax z23.s, p2/M, z23.s, z26.s\n"
+ "fmax z22.s, p2/M, z22.s, z26.s\n"
+ "ld1w { z24.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmax z21.s, p2/M, z21.s, z26.s\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmin z20.s, p2/M, z20.s, z25.s\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "fmin z19.s, p2/M, z19.s, z25.s\n"
+ "st1w { z23.s }, p0, [x11, x25, LSL #2]\n"
+ "mov z23.d, z24.d\n"
+ "st1w { z22.s }, p0, [x10, x25, LSL #2]\n"
+ "mov z22.d, z24.d\n"
+ "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
+ "mov z21.d, z24.d\n"
+ "fmax z20.s, p2/M, z20.s, z26.s\n"
+ "st1w { z20.s }, p0, [x27, x25, LSL #2]\n"
+ "mov z20.d, z24.d\n"
+ "fmax z19.s, p2/M, z19.s, z26.s\n"
+ "st1w { z19.s }, p0, [x26, x25, LSL #2]\n"
+ "mov z19.d, z24.d\n"
+ "fmin z18.s, p2/M, z18.s, z25.s\n"
+ "fmin z17.s, p2/M, z17.s, z25.s\n"
+ "fmin z16.s, p2/M, z16.s, z25.s\n"
+ "fmax z18.s, p2/M, z18.s, z26.s\n"
+ "st1w { z18.s }, p0, [x24, x25, LSL #2]\n"
+ "mov z18.d, z24.d\n"
+ "fmax z17.s, p2/M, z17.s, z26.s\n"
+ "st1w { z17.s }, p0, [x23, x25, LSL #2]\n"
+ "mov z17.d, z24.d\n"
+ "fmax z16.s, p2/M, z16.s, z26.s\n"
+ "st1w { z16.s }, p0, [x22, x25, LSL #2]\n"
+ "mov z16.d, z24.d\n"
+ "incw x25\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
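
For readers not fluent in SVE: in the kernel above, each "fmla zA.s, zW.s, zI.s[k]" multiplies a vector of per-output-channel weights by one broadcast input element, so the nine accumulators z16-z24 each carry one pixel of the 3x3 output tile across a vector of channel-multiplier lanes, and the fmin/fmax pairs clamp to the activation range before the strided st1w stores. A scalar reference for what one such tile computes, per channel (assuming a plain stride-2 3x3 depthwise window; names are illustrative):

#include <algorithm>

// One 3x3, stride-2 depthwise output tile for a single channel.
void tile_3x3_s2_output3x3(const float in[7][7], const float w[3][3],
                           float bias, float act_min, float act_max,
                           float out[3][3])
{
  for (int oi = 0; oi < 3; oi++)
  {
    for (int oj = 0; oj < 3; oj++)
    {
      float acc = bias;
      for (int ki = 0; ki < 3; ki++)
        for (int kj = 0; kj < 3; kj++)
          acc += w[ki][kj] * in[oi * 2 + ki][oj * 2 + kj];
      // The fmin/fmax pair in the asm: clamp to the activation range.
      out[oi][oj] = std::min(std::max(acc, act_min), act_max);
    }
  }
}
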
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3c2f77156d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 8;
+ constexpr static unsigned int input_col_quads = 2;
+
+ kern_type kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+
+ sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
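
As a cross-check of the constants above (an inference, not stated in the patch): with a 5x5 kernel at stride 1, a 2x4 output tile needs 5 + (2 - 1) * 1 = 6 input rows and 5 + (4 - 1) * 1 = 8 input columns, matching input_rows = 6 and input_cols = 8; input_col_quads = 2 again corresponds to two four-float groups per packed input row.
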
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..453b00c0db
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x25, [%x[outptrs], #0x20]\n"
+ "mov x24, #0x0\n"
+ "ldp x23, x22, [%x[outptrs], #0x30]\n"
+ "whilelt p1.s, x27, %x[channel_multiplier]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr x19, [%x[inptrs], #0x10]\n"
+ "ld1rqw { z2.s }, p2/Z, [x21]\n"
+ "ld1rqw { z3.s }, p2/Z, [x21, #16]\n"
+ "ld1rqw { z4.s }, p2/Z, [x20]\n"
+ "ld1rqw { z5.s }, p2/Z, [x20, #16]\n"
+ "ld1rqw { z6.s }, p2/Z, [x19]\n"
+ "ld1rqw { z7.s }, p2/Z, [x19, #16]\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr x19, [%x[inptrs], #0x28]\n"
+ "ld1rqw { z8.s }, p2/Z, [x21]\n"
+ "ld1rqw { z9.s }, p2/Z, [x21, #16]\n"
+ "ld1rqw { z10.s }, p2/Z, [x20]\n"
+ "ld1rqw { z11.s }, p2/Z, [x20, #16]\n"
+ "ld1rqw { z12.s }, p2/Z, [x19]\n"
+ "ld1rqw { z13.s }, p2/Z, [x19, #16]\n"
+ "ld1rw { z25.s }, p2/Z, [%x[clamps]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[clamps], #4]\n"
+ "ld1w { z23.s }, p1/Z, [%x[params]]\n"
+ "mov z22.d, z23.d\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z21.d, z23.d\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "mov z20.d, z23.d\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z19.d, z23.d\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "mov z18.d, z23.d\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ "mov z17.d, z23.d\n"
+ "mov z16.d, z23.d\n"
+ "1:" // Output channel complete vector loop
+ "mov z0.d, z8.d\n"
+ "mov p0.b, p1.b\n"
+ "mov z1.d, z9.d\n"
+ "incw x27\n"
+ "fmla z23.s, z31.s, z2.s[0]\n"
+ "whilelt p1.s, x27, %x[channel_multiplier]\n"
+ "fmla z22.s, z31.s, z2.s[1]\n"
+ "fmla z21.s, z31.s, z2.s[2]\n"
+ "fmla z20.s, z31.s, z2.s[3]\n"
+ "fmla z19.s, z31.s, z4.s[0]\n"
+ "fmla z18.s, z31.s, z4.s[1]\n"
+ "fmla z17.s, z31.s, z4.s[2]\n"
+ "fmla z16.s, z31.s, z4.s[3]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params]]\n"
+ "fmla z23.s, z30.s, z2.s[1]\n"
+ "fmla z22.s, z30.s, z2.s[2]\n"
+ "fmla z21.s, z30.s, z2.s[3]\n"
+ "fmla z20.s, z30.s, z3.s[0]\n"
+ "fmla z19.s, z30.s, z4.s[1]\n"
+ "fmla z18.s, z30.s, z4.s[2]\n"
+ "fmla z17.s, z30.s, z4.s[3]\n"
+ "fmla z16.s, z30.s, z5.s[0]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z23.s, z29.s, z2.s[2]\n"
+ "fmla z22.s, z29.s, z2.s[3]\n"
+ "fmla z21.s, z29.s, z3.s[0]\n"
+ "fmla z20.s, z29.s, z3.s[1]\n"
+ "fmla z19.s, z29.s, z4.s[2]\n"
+ "fmla z18.s, z29.s, z4.s[3]\n"
+ "fmla z17.s, z29.s, z5.s[0]\n"
+ "fmla z16.s, z29.s, z5.s[1]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z23.s, z28.s, z2.s[3]\n"
+ "fmla z22.s, z28.s, z3.s[0]\n"
+ "fmla z21.s, z28.s, z3.s[1]\n"
+ "fmla z20.s, z28.s, z3.s[2]\n"
+ "fmla z19.s, z28.s, z4.s[3]\n"
+ "fmla z18.s, z28.s, z5.s[0]\n"
+ "fmla z17.s, z28.s, z5.s[1]\n"
+ "fmla z16.s, z28.s, z5.s[2]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z23.s, z27.s, z3.s[0]\n"
+ "fmla z22.s, z27.s, z3.s[1]\n"
+ "fmla z21.s, z27.s, z3.s[2]\n"
+ "fmla z20.s, z27.s, z3.s[3]\n"
+ "fmla z19.s, z27.s, z5.s[0]\n"
+ "fmla z18.s, z27.s, z5.s[1]\n"
+ "fmla z17.s, z27.s, z5.s[2]\n"
+ "fmla z16.s, z27.s, z5.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z23.s, z31.s, z4.s[0]\n"
+ "fmla z22.s, z31.s, z4.s[1]\n"
+ "fmla z21.s, z31.s, z4.s[2]\n"
+ "fmla z20.s, z31.s, z4.s[3]\n"
+ "fmla z19.s, z31.s, z6.s[0]\n"
+ "fmla z18.s, z31.s, z6.s[1]\n"
+ "fmla z17.s, z31.s, z6.s[2]\n"
+ "fmla z16.s, z31.s, z6.s[3]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z23.s, z30.s, z4.s[1]\n"
+ "fmla z22.s, z30.s, z4.s[2]\n"
+ "fmla z21.s, z30.s, z4.s[3]\n"
+ "fmla z20.s, z30.s, z5.s[0]\n"
+ "fmla z19.s, z30.s, z6.s[1]\n"
+ "fmla z18.s, z30.s, z6.s[2]\n"
+ "fmla z17.s, z30.s, z6.s[3]\n"
+ "fmla z16.s, z30.s, z7.s[0]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z23.s, z29.s, z4.s[2]\n"
+ "fmla z22.s, z29.s, z4.s[3]\n"
+ "fmla z21.s, z29.s, z5.s[0]\n"
+ "fmla z20.s, z29.s, z5.s[1]\n"
+ "fmla z19.s, z29.s, z6.s[2]\n"
+ "fmla z18.s, z29.s, z6.s[3]\n"
+ "fmla z17.s, z29.s, z7.s[0]\n"
+ "fmla z16.s, z29.s, z7.s[1]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmla z23.s, z28.s, z4.s[3]\n"
+ "fmla z22.s, z28.s, z5.s[0]\n"
+ "fmla z21.s, z28.s, z5.s[1]\n"
+ "fmla z20.s, z28.s, z5.s[2]\n"
+ "fmla z19.s, z28.s, z6.s[3]\n"
+ "fmla z18.s, z28.s, z7.s[0]\n"
+ "fmla z17.s, z28.s, z7.s[1]\n"
+ "fmla z16.s, z28.s, z7.s[2]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z23.s, z27.s, z5.s[0]\n"
+ "fmla z22.s, z27.s, z5.s[1]\n"
+ "fmla z21.s, z27.s, z5.s[2]\n"
+ "fmla z20.s, z27.s, z5.s[3]\n"
+ "fmla z19.s, z27.s, z7.s[0]\n"
+ "fmla z18.s, z27.s, z7.s[1]\n"
+ "fmla z17.s, z27.s, z7.s[2]\n"
+ "fmla z16.s, z27.s, z7.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "fmla z23.s, z31.s, z6.s[0]\n"
+ "fmla z22.s, z31.s, z6.s[1]\n"
+ "fmla z21.s, z31.s, z6.s[2]\n"
+ "fmla z20.s, z31.s, z6.s[3]\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z18.s, z31.s, z0.s[1]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z16.s, z31.s, z0.s[3]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z23.s, z30.s, z6.s[1]\n"
+ "fmla z22.s, z30.s, z6.s[2]\n"
+ "fmla z21.s, z30.s, z6.s[3]\n"
+ "fmla z20.s, z30.s, z7.s[0]\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "fmla z18.s, z30.s, z0.s[2]\n"
+ "fmla z17.s, z30.s, z0.s[3]\n"
+ "fmla z16.s, z30.s, z1.s[0]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z23.s, z29.s, z6.s[2]\n"
+ "fmla z22.s, z29.s, z6.s[3]\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
+ "fmla z20.s, z29.s, z7.s[1]\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmla z18.s, z29.s, z0.s[3]\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[1]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z23.s, z28.s, z6.s[3]\n"
+ "fmla z22.s, z28.s, z7.s[0]\n"
+ "fmla z21.s, z28.s, z7.s[1]\n"
+ "fmla z20.s, z28.s, z7.s[2]\n"
+ "fmla z19.s, z28.s, z0.s[3]\n"
+ "fmla z18.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z1.s[1]\n"
+ "fmla z16.s, z28.s, z1.s[2]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z23.s, z27.s, z7.s[0]\n"
+ "fmla z22.s, z27.s, z7.s[1]\n"
+ "fmla z21.s, z27.s, z7.s[2]\n"
+ "fmla z20.s, z27.s, z7.s[3]\n"
+ "fmla z19.s, z27.s, z1.s[0]\n"
+ "fmla z18.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z16.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z22.s, z31.s, z0.s[1]\n"
+ "fmla z21.s, z31.s, z0.s[2]\n"
+ "fmla z20.s, z31.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z18.s, z31.s, z0.s[1]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z16.s, z31.s, z0.s[3]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z23.s, z30.s, z0.s[1]\n"
+ "fmla z22.s, z30.s, z0.s[2]\n"
+ "fmla z21.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z20.s, z30.s, z1.s[0]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "fmla z18.s, z30.s, z0.s[2]\n"
+ "fmla z17.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z16.s, z30.s, z1.s[0]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params]]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z23.s, z29.s, z0.s[2]\n"
+ "fmla z22.s, z29.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z21.s, z29.s, z1.s[0]\n"
+ "fmla z20.s, z29.s, z1.s[1]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmla z18.s, z29.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[1]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z23.s, z28.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z22.s, z28.s, z1.s[0]\n"
+ "fmla z21.s, z28.s, z1.s[1]\n"
+ "fmla z20.s, z28.s, z1.s[2]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z19.s, z28.s, z0.s[3]\n"
+ "fmla z18.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z1.s[1]\n"
+ "fmla z16.s, z28.s, z1.s[2]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z23.s, z27.s, z1.s[0]\n"
+ "fmla z22.s, z27.s, z1.s[1]\n"
+ "fmla z21.s, z27.s, z1.s[2]\n"
+ "fmla z20.s, z27.s, z1.s[3]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z19.s, z27.s, z1.s[0]\n"
+ "fmla z18.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z16.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z22.s, z31.s, z0.s[1]\n"
+ "fmla z21.s, z31.s, z0.s[2]\n"
+ "fmla z20.s, z31.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z18.s, z31.s, z0.s[1]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z16.s, z31.s, z0.s[3]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z23.s, z30.s, z0.s[1]\n"
+ "fmla z22.s, z30.s, z0.s[2]\n"
+ "fmla z21.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z20.s, z30.s, z1.s[0]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "fmla z18.s, z30.s, z0.s[2]\n"
+ "fmla z17.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z16.s, z30.s, z1.s[0]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z23.s, z29.s, z0.s[2]\n"
+ "fmla z22.s, z29.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z21.s, z29.s, z1.s[0]\n"
+ "fmla z20.s, z29.s, z1.s[1]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmla z18.s, z29.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[1]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z23.s, z28.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z22.s, z28.s, z1.s[0]\n"
+ "fmla z21.s, z28.s, z1.s[1]\n"
+ "fmla z20.s, z28.s, z1.s[2]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z19.s, z28.s, z0.s[3]\n"
+ "fmla z18.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z1.s[1]\n"
+ "fmla z16.s, z28.s, z1.s[2]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z23.s, z27.s, z1.s[0]\n"
+ "fmla z22.s, z27.s, z1.s[1]\n"
+ "fmla z21.s, z27.s, z1.s[2]\n"
+ "fmla z20.s, z27.s, z1.s[3]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z19.s, z27.s, z1.s[0]\n"
+ "fmla z18.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z16.s, z27.s, z1.s[3]\n"
+ "fmin z23.s, p2/M, z23.s, z24.s\n"
+ "fmin z22.s, p2/M, z22.s, z24.s\n"
+ "fmin z21.s, p2/M, z21.s, z24.s\n"
+ "fmin z20.s, p2/M, z20.s, z24.s\n"
+ "fmax z23.s, p2/M, z23.s, z25.s\n"
+ "st1w { z23.s }, p0, [x11, x24, LSL #2]\n"
+ "fmax z22.s, p2/M, z22.s, z25.s\n"
+ "fmax z21.s, p2/M, z21.s, z25.s\n"
+ "ld1w { z23.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmax z20.s, p2/M, z20.s, z25.s\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmin z19.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "fmin z18.s, p2/M, z18.s, z24.s\n"
+ "st1w { z22.s }, p0, [x10, x24, LSL #2]\n"
+ "mov z22.d, z23.d\n"
+ "st1w { z21.s }, p0, [x9, x24, LSL #2]\n"
+ "mov z21.d, z23.d\n"
+ "st1w { z20.s }, p0, [x28, x24, LSL #2]\n"
+ "mov z20.d, z23.d\n"
+ "fmax z19.s, p2/M, z19.s, z25.s\n"
+ "st1w { z19.s }, p0, [x26, x24, LSL #2]\n"
+ "mov z19.d, z23.d\n"
+ "fmax z18.s, p2/M, z18.s, z25.s\n"
+ "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+ "mov z18.d, z23.d\n"
+ "fmin z17.s, p2/M, z17.s, z24.s\n"
+ "fmin z16.s, p2/M, z16.s, z24.s\n"
+ "fmax z17.s, p2/M, z17.s, z25.s\n"
+ "st1w { z17.s }, p0, [x23, x24, LSL #2]\n"
+ "mov z17.d, z23.d\n"
+ "fmax z16.s, p2/M, z16.s, z25.s\n"
+ "st1w { z16.s }, p0, [x22, x24, LSL #2]\n"
+ "mov z16.d, z23.d\n"
+ "incw x24\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
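
Both kernels above share the same loop control: "whilelt p1.s" builds a predicate covering however many channels remain, "incw" advances the channel index by one vector length, and "b.any" loops while any predicate lane is live, so no scalar tail loop is needed. A minimal ACLE-intrinsics sketch of the same idiom (an illustrative clamp loop, not the generated kernel; requires an SVE-enabled compiler):

#include <arm_sve.h>
#include <cstdint>

void clamp_channels(float *data, uint64_t n_channels, float lo, float hi)
{
  uint64_t i = 0;
  svbool_t pg = svwhilelt_b32_u64(i, n_channels);   // whilelt p1.s
  while (svptest_any(svptrue_b32(), pg))            // b.any 1b
  {
    svfloat32_t v = svld1_f32(pg, data + i);
    v = svmax_f32_m(pg, v, svdup_n_f32(lo));        // fmax against the clamp
    v = svmin_f32_m(pg, v, svdup_n_f32(hi));        // fmin against the clamp
    svst1_f32(pg, data + i, v);
    i += svcntw();                                  // incw
    pg = svwhilelt_b32_u64(i, n_channels);
  }
}
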
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7a4bd1dd1e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+ typedef float bias_type;
+ typedef float input_type;
+ typedef float weight_type;
+ typedef float return_type;
+
+ typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 2; }
+
+ kern_type kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+ sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
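
Unlike the fixed-shape descriptors earlier in this patch, this generic variant takes the number of kernel points and a bias pointer at run time (note that output_rows/output_cols become constexpr functions here rather than data members). Per output channel, the computation is a dot product over kernel points for each of the 2x8 = 16 output pixels, followed by the activation clamp. A scalar sketch with illustrative names (in the real kernel each point supplies its samples through two input pointers; this flattens them into one array per point):

#include <algorithm>

// One output channel of the generic 2x8 kernel: inputs[k] points at the
// 16 input samples feeding the tile for kernel point k.
void generic_tile(const float *const *inputs, const float *weights,
                  float bias, unsigned int kernel_points,
                  float act_min, float act_max, float out[16])
{
  for (int p = 0; p < 16; ++p) out[p] = bias;
  for (unsigned int k = 0; k < kernel_points; ++k)
    for (int p = 0; p < 16; ++p)
      out[p] += weights[k] * inputs[k][p];          // the fmla chains
  for (int p = 0; p < 16; ++p)                      // fmin/fmax clamp
    out[p] = std::min(std::max(out[p], act_min), act_max);
}
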
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0124370067
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "ld1rw { z11.s }, p1/Z, [%x[minmax_vals]]\n"
+ "mov x28, #0x0\n"
+ "ld1rw { z10.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x28, %x[n_output_channels]\n"
+ "1:" // Output channel loop
+ "mov z16.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z16.s }, p0/Z, [%x[bias], x28, LSL #2]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov z9.d, z16.d\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "mov x20, %x[inptrs]\n"
+ "mov z31.d, z16.d\n"
+ "ldp x24, x27, [x20], #0x10\n"
+ "lsr x19, %x[kernel_points], #0x1\n"
+ "mov z30.d, z16.d\n"
+ "ld1rqw { z7.s }, p1/Z, [x24]\n"
+ "mov z29.d, z16.d\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "mov z28.d, z16.d\n"
+ "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+ "mov z27.d, z16.d\n"
+ "ld1rqw { z5.s }, p1/Z, [x27]\n"
+ "mov z26.d, z16.d\n"
+ "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
+ "mov z25.d, z16.d\n"
+ "mov z24.d, z16.d\n"
+ "mov z23.d, z16.d\n"
+ "mov z22.d, z16.d\n"
+ "mov z21.d, z16.d\n"
+ "mov z20.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "mov z18.d, z16.d\n"
+ "mov z17.d, z16.d\n"
+ "cbz x19, 6f\n"
+ "ldp x24, x27, [x20], #0x10\n"
+ "ld1w { z16.s }, p1/Z, [%x[weights]]\n"
+ "subs x19, x19, #0x1\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "ld1rqw { z3.s }, p1/Z, [x24]\n"
+ "ld1rqw { z2.s }, p1/Z, [x24, #16]\n"
+ "ld1rqw { z1.s }, p1/Z, [x27]\n"
+ "ld1rqw { z0.s }, p1/Z, [x27, #16]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "fmla z9.s, z8.s, z7.s[0]\n"
+ "ldp x24, x27, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "fmla z31.s, z8.s, z7.s[1]\n"
+ "fmla z30.s, z8.s, z7.s[2]\n"
+ "fmla z29.s, z8.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x24]\n"
+ "fmla z28.s, z8.s, z6.s[0]\n"
+ "fmla z27.s, z8.s, z6.s[1]\n"
+ "fmla z26.s, z8.s, z6.s[2]\n"
+ "fmla z25.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+ "fmla z24.s, z8.s, z5.s[0]\n"
+ "fmla z23.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z21.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x27]\n"
+ "fmla z20.s, z8.s, z4.s[0]\n"
+ "fmla z19.s, z8.s, z4.s[1]\n"
+ "fmla z18.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z8.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
+ "fmla z9.s, z16.s, z3.s[0]\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "fmla z31.s, z16.s, z3.s[1]\n"
+ "ldp x24, x27, [x20], #0x10\n"
+ "fmla z30.s, z16.s, z3.s[2]\n"
+ "fmla z29.s, z16.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x24]\n"
+ "fmla z28.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z2.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z25.s, z16.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x24, #16]\n"
+ "fmla z24.s, z16.s, z1.s[0]\n"
+ "fmla z23.s, z16.s, z1.s[1]\n"
+ "fmla z22.s, z16.s, z1.s[2]\n"
+ "fmla z21.s, z16.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x27]\n"
+ "fmla z20.s, z16.s, z0.s[0]\n"
+ "fmla z19.s, z16.s, z0.s[1]\n"
+ "fmla z18.s, z16.s, z0.s[2]\n"
+ "fmla z17.s, z16.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x27, #16]\n"
+ "ld1w { z16.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+ "addvl %x[weights], %x[weights], #2\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla z9.s, z8.s, z7.s[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla z31.s, z8.s, z7.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla z30.s, z8.s, z7.s[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla z29.s, z8.s, z7.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z6.s[0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z27.s, z8.s, z6.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla z26.s, z8.s, z6.s[2]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla z25.s, z8.s, z6.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z5.s[0]\n"
+ "fmla z23.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z21.s, z8.s, z5.s[3]\n"
+ "fmla z20.s, z8.s, z4.s[0]\n"
+ "fmla z19.s, z8.s, z4.s[1]\n"
+ "fmla z18.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z8.s, z4.s[3]\n"
+ "fmla z9.s, z16.s, z3.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[1]\n"
+ "fmla z30.s, z16.s, z3.s[2]\n"
+ "fmla z29.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z2.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z25.s, z16.s, z2.s[3]\n"
+ "fmla z24.s, z16.s, z1.s[0]\n"
+ "fmla z23.s, z16.s, z1.s[1]\n"
+ "fmla z22.s, z16.s, z1.s[2]\n"
+ "fmla z21.s, z16.s, z1.s[3]\n"
+ "fmla z20.s, z16.s, z0.s[0]\n"
+ "fmla z19.s, z16.s, z0.s[1]\n"
+ "fmla z18.s, z16.s, z0.s[2]\n"
+ "fmla z17.s, z16.s, z0.s[3]\n"
+ "fmin z9.s, p1/M, z9.s, z10.s\n"
+ "fmin z31.s, p1/M, z31.s, z10.s\n"
+ "fmin z30.s, p1/M, z30.s, z10.s\n"
+ "fmin z29.s, p1/M, z29.s, z10.s\n"
+ "fmax z9.s, p1/M, z9.s, z11.s\n"
+ "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
+ "fmax z31.s, p1/M, z31.s, z11.s\n"
+ "fmax z30.s, p1/M, z30.s, z11.s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmax z29.s, p1/M, z29.s, z11.s\n"
+ "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
+ "fmin z28.s, p1/M, z28.s, z10.s\n"
+ "fmin z27.s, p1/M, z27.s, z10.s\n"
+ "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z26.s, p1/M, z26.s, z10.s\n"
+ "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
+ "fmin z25.s, p1/M, z25.s, z10.s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin z24.s, p1/M, z24.s, z10.s\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax z28.s, p1/M, z28.s, z11.s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax z27.s, p1/M, z27.s, z11.s\n"
+ "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z11.s\n"
+ "fmax z25.s, p1/M, z25.s, z11.s\n"
+ "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
+ "fmax z24.s, p1/M, z24.s, z11.s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z23.s, p1/M, z23.s, z10.s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmin z22.s, p1/M, z22.s, z10.s\n"
+ "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
+ "fmin z21.s, p1/M, z21.s, z10.s\n"
+ "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z10.s\n"
+ "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
+ "fmax z23.s, p1/M, z23.s, z11.s\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax z22.s, p1/M, z22.s, z11.s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax z21.s, p1/M, z21.s, z11.s\n"
+ "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
+ "fmax z20.s, p1/M, z20.s, z11.s\n"
+ "fmin z19.s, p1/M, z19.s, z10.s\n"
+ "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z18.s, p1/M, z18.s, z10.s\n"
+ "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
+ "fmin z17.s, p1/M, z17.s, z10.s\n"
+ "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
+ "fmax z19.s, p1/M, z19.s, z11.s\n"
+ "fmax z18.s, p1/M, z18.s, z11.s\n"
+ "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
+ "fmax z17.s, p1/M, z17.s, z11.s\n"
+ "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
+ "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla z9.s, z8.s, z7.s[0]\n"
+ "ldp x24, x27, [x20], #0x10\n"
+ "fmla z31.s, z8.s, z7.s[1]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla z30.s, z8.s, z7.s[2]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla z29.s, z8.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x24]\n"
+ "fmla z28.s, z8.s, z6.s[0]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla z27.s, z8.s, z6.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla z26.s, z8.s, z6.s[2]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z25.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+ "fmla z24.s, z8.s, z5.s[0]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla z23.s, z8.s, z5.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla z21.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x27]\n"
+ "fmla z20.s, z8.s, z4.s[0]\n"
+ "fmla z19.s, z8.s, z4.s[1]\n"
+ "fmla z18.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z8.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
+ "fmla z9.s, z16.s, z3.s[0]\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "fmla z31.s, z16.s, z3.s[1]\n"
+ "fmla z30.s, z16.s, z3.s[2]\n"
+ "fmla z29.s, z16.s, z3.s[3]\n"
+ "fmla z28.s, z16.s, z2.s[0]\n"
+ "fmla z27.s, z16.s, z2.s[1]\n"
+ "fmla z26.s, z16.s, z2.s[2]\n"
+ "fmla z25.s, z16.s, z2.s[3]\n"
+ "fmla z24.s, z16.s, z1.s[0]\n"
+ "fmla z23.s, z16.s, z1.s[1]\n"
+ "fmla z22.s, z16.s, z1.s[2]\n"
+ "fmla z21.s, z16.s, z1.s[3]\n"
+ "fmla z20.s, z16.s, z0.s[0]\n"
+ "fmla z19.s, z16.s, z0.s[1]\n"
+ "fmla z18.s, z16.s, z0.s[2]\n"
+ "fmla z17.s, z16.s, z0.s[3]\n"
+ "fmla z9.s, z8.s, z7.s[0]\n"
+ "fmla z31.s, z8.s, z7.s[1]\n"
+ "fmla z30.s, z8.s, z7.s[2]\n"
+ "fmla z29.s, z8.s, z7.s[3]\n"
+ "fmla z28.s, z8.s, z6.s[0]\n"
+ "fmla z27.s, z8.s, z6.s[1]\n"
+ "fmla z26.s, z8.s, z6.s[2]\n"
+ "fmla z25.s, z8.s, z6.s[3]\n"
+ "fmla z24.s, z8.s, z5.s[0]\n"
+ "fmla z23.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z21.s, z8.s, z5.s[3]\n"
+ "fmla z20.s, z8.s, z4.s[0]\n"
+ "fmla z19.s, z8.s, z4.s[1]\n"
+ "fmla z18.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z8.s, z4.s[3]\n"
+ "fmin z9.s, p1/M, z9.s, z10.s\n"
+ "fmin z31.s, p1/M, z31.s, z10.s\n"
+ "fmin z30.s, p1/M, z30.s, z10.s\n"
+ "fmin z29.s, p1/M, z29.s, z10.s\n"
+ "fmax z9.s, p1/M, z9.s, z11.s\n"
+ "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
+ "fmax z31.s, p1/M, z31.s, z11.s\n"
+ "fmax z30.s, p1/M, z30.s, z11.s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmax z29.s, p1/M, z29.s, z11.s\n"
+ "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
+ "fmin z28.s, p1/M, z28.s, z10.s\n"
+ "fmin z27.s, p1/M, z27.s, z10.s\n"
+ "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z26.s, p1/M, z26.s, z10.s\n"
+ "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
+ "fmin z25.s, p1/M, z25.s, z10.s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin z24.s, p1/M, z24.s, z10.s\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax z28.s, p1/M, z28.s, z11.s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax z27.s, p1/M, z27.s, z11.s\n"
+ "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z11.s\n"
+ "fmax z25.s, p1/M, z25.s, z11.s\n"
+ "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
+ "fmax z24.s, p1/M, z24.s, z11.s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z23.s, p1/M, z23.s, z10.s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmin z22.s, p1/M, z22.s, z10.s\n"
+ "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
+ "fmin z21.s, p1/M, z21.s, z10.s\n"
+ "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z10.s\n"
+ "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
+ "fmax z23.s, p1/M, z23.s, z11.s\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax z22.s, p1/M, z22.s, z11.s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax z21.s, p1/M, z21.s, z11.s\n"
+ "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
+ "fmax z20.s, p1/M, z20.s, z11.s\n"
+ "fmin z19.s, p1/M, z19.s, z10.s\n"
+ "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z18.s, p1/M, z18.s, z10.s\n"
+ "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
+ "fmin z17.s, p1/M, z17.s, z10.s\n"
+ "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
+ "fmax z19.s, p1/M, z19.s, z11.s\n"
+ "fmax z18.s, p1/M, z18.s, z11.s\n"
+ "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
+ "fmax z17.s, p1/M, z17.s, z11.s\n"
+ "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
+ "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla z9.s, z8.s, z7.s[0]\n"
+ "ldr x19, [%x[outptrs], #0x0]\n"
+ "fmla z31.s, z8.s, z7.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x8]\n"
+ "fmla z30.s, z8.s, z7.s[2]\n"
+ "ldr x21, [%x[outptrs], #0x10]\n"
+ "fmla z29.s, z8.s, z7.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z6.s[0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z27.s, z8.s, z6.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x28]\n"
+ "fmla z26.s, z8.s, z6.s[2]\n"
+ "ldr x25, [%x[outptrs], #0x30]\n"
+ "fmla z25.s, z8.s, z6.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z5.s[0]\n"
+ "fmla z23.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z21.s, z8.s, z5.s[3]\n"
+ "fmla z20.s, z8.s, z4.s[0]\n"
+ "fmla z19.s, z8.s, z4.s[1]\n"
+ "fmla z18.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z8.s, z4.s[3]\n"
+ "fmin z9.s, p1/M, z9.s, z10.s\n"
+ "fmin z31.s, p1/M, z31.s, z10.s\n"
+ "fmin z30.s, p1/M, z30.s, z10.s\n"
+ "fmin z29.s, p1/M, z29.s, z10.s\n"
+ "fmax z9.s, p1/M, z9.s, z11.s\n"
+ "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
+ "fmax z31.s, p1/M, z31.s, z11.s\n"
+ "fmax z30.s, p1/M, z30.s, z11.s\n"
+ "ldr x19, [%x[outptrs], #0x40]\n"
+ "fmax z29.s, p1/M, z29.s, z11.s\n"
+ "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
+ "fmin z28.s, p1/M, z28.s, z10.s\n"
+ "fmin z27.s, p1/M, z27.s, z10.s\n"
+ "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z26.s, p1/M, z26.s, z10.s\n"
+ "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
+ "fmin z25.s, p1/M, z25.s, z10.s\n"
+ "ldr x20, [%x[outptrs], #0x48]\n"
+ "fmin z24.s, p1/M, z24.s, z10.s\n"
+ "ldr x21, [%x[outptrs], #0x50]\n"
+ "fmax z28.s, p1/M, z28.s, z11.s\n"
+ "ldr x22, [%x[outptrs], #0x58]\n"
+ "fmax z27.s, p1/M, z27.s, z11.s\n"
+ "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z11.s\n"
+ "fmax z25.s, p1/M, z25.s, z11.s\n"
+ "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
+ "fmax z24.s, p1/M, z24.s, z11.s\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z23.s, p1/M, z23.s, z10.s\n"
+ "ldr x24, [%x[outptrs], #0x68]\n"
+ "fmin z22.s, p1/M, z22.s, z10.s\n"
+ "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
+ "fmin z21.s, p1/M, z21.s, z10.s\n"
+ "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z10.s\n"
+ "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
+ "fmax z23.s, p1/M, z23.s, z11.s\n"
+ "ldr x25, [%x[outptrs], #0x70]\n"
+ "fmax z22.s, p1/M, z22.s, z11.s\n"
+ "ldr x26, [%x[outptrs], #0x78]\n"
+ "fmax z21.s, p1/M, z21.s, z11.s\n"
+ "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
+ "fmax z20.s, p1/M, z20.s, z11.s\n"
+ "fmin z19.s, p1/M, z19.s, z10.s\n"
+ "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
+ "fmin z18.s, p1/M, z18.s, z10.s\n"
+ "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
+ "fmin z17.s, p1/M, z17.s, z10.s\n"
+ "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
+ "fmax z19.s, p1/M, z19.s, z11.s\n"
+ "fmax z18.s, p1/M, z18.s, z11.s\n"
+ "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
+ "fmax z17.s, p1/M, z17.s, z11.s\n"
+ "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
+ "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+ "7:" // Output channel loop: Done
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_output_channels]\n"
+ "b.any 1b\n"
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
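
The branch structure in the kernel above is a standard unroll-by-two over kernel points: "lsr x19, %x[kernel_points], #0x1" computes the number of point pairs, "cbz x19, 6f" takes a single-point fast path, the main loop at label 3 consumes one pair per trip, and "tbnz %x[kernel_points], #0, 5f" selects the odd tail. Roughly, in C terms (an approximation of the control flow, not generated from the assembly):

void kernel_point_control_flow(unsigned int kernel_points)
{
  unsigned int pairs = kernel_points >> 1;   // lsr x19, ..., #0x1
  if (pairs == 0)
  {
    /* label 6: single kernel point, accumulate, clamp, store */
    return;
  }
  while (--pairs != 0)
  {
    /* label 3: two fused multiply-accumulate passes per trip,
       preloading the next pair of input rows and weight vectors */
  }
  if (kernel_points & 1)
  {
    /* label 5: odd tail -- the loaded pair plus one final point */
  }
  else
  {
    /* label 4: even tail -- the loaded pair, then clamp and store */
  }
}
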
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..295e1f6450
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
+
+ kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
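
The quantized s8q kernels that follow swap the float clamp for the arm_gemm Requantize32 sequence visible in the assembly: int32 accumulators built with sdot (plus an mls correction subtracting b_offset times per-window input sums, gathered by sdot against the 0x00010101 constant in z12), then sqrdmulh by a fixed-point multiplier, an srshl rounding shift, addition of the output offset c_offset, and an smax/smin clamp to [minval, maxval]. A scalar sketch of the final scaling step (assuming a single per-layer multiplier and shift, and omitting the sqrdmulh saturation corner case; the real kernels may use per-channel values):

#include <algorithm>
#include <cstdint>

int8_t requantize(int32_t acc, int32_t multiplier, int shift,
                  int32_t c_offset, int32_t minval, int32_t maxval)
{
  // sqrdmulh: rounding doubling multiply-high, (2*acc*mul + 2^31) >> 32,
  // written here as (acc*mul + 2^30) >> 31.
  int64_t prod = (int64_t) acc * multiplier;
  int32_t scaled = (int32_t) ((prod + (1LL << 30)) >> 31);
  // srshl by a negative amount: rounding arithmetic shift right.
  if (shift < 0)
    scaled = (scaled + (1 << (-shift - 1))) >> -shift;
  int32_t out = scaled + c_offset;            // add c_offset (z8)
  out = std::max(out, minval);                // smax against minval (z11)
  out = std::min(out, maxval);                // smin against maxval (z10)
  return (int8_t) out;
}
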
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..90f924a8ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+ __asm__ __volatile__(
+ "ldp x11, x10, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "ldp x27, x26, [%x[inptrs], #0x20]\n"
+ "mov x19, #0x1\n"
+ "ldp x25, x24, [%x[inptrs], #0x30]\n"
+ "orr x19, x19, #0x100\n"
+ "ldp x23, x22, [%x[outptrs], #0x0]\n"
+ "orr x19, x19, #0x10000\n"
+ "dup z12.s, w19\n"
+ "ldp x21, x20, [%x[outptrs], #0x10]\n"
+ "mov x19, #0x0\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "whilelt p1.b, x19, %x[n_channels]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "1:" // Loop
+ "mov z7.s, #0x0\n"
+ "ld1b { z19.b }, p1/Z, [x11, x19]\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "mov z6.s, #0x0\n"
+ "ld1b { z18.b }, p1/Z, [x10, x19]\n"
+ "ldp x11, x10, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p1/Z, [x9, x19]\n"
+ "zip1 z21.b, z19.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, x19]\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x50]\n"
+ "ld1b { z23.b }, p1/Z, [x27, x19]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "ld1b { z20.b }, p1/Z, [x26, x19]\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x60]\n"
+ "zip1 z5.b, z21.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x25, x19]\n"
+ "zip2 z4.b, z21.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x24, x19]\n"
+ "zip1 z29.b, z19.b, z18.b\n"
+ "ldp x25, x24, [%x[inptrs], #0x70]\n"
+ "zip2 z28.b, z19.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x11, x19]\n"
+ "zip1 z19.b, z23.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x10, x19]\n"
+ "zip2 z27.b, z23.b, z17.b\n"
+ "ldp x11, x10, [%x[inptrs], #0x0]\n"
+ "zip1 z18.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x9, x19]\n"
+ "zip2 z20.b, z20.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x28, x19]\n"
+ "zip1 z3.b, z19.b, z18.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x10]\n"
+ "zip2 z2.b, z19.b, z18.b\n"
+ "ld1b { z19.b }, p1/Z, [x27, x19]\n"
+ "zip1 z26.b, z22.b, z17.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x19]\n"
+ "zip2 z24.b, z22.b, z17.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x20]\n"
+ "zip1 z23.b, z21.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x25, x19]\n"
+ "zip2 z22.b, z21.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, x19]\n"
+ "zip1 z17.b, z27.b, z20.b\n"
+ "ldp x25, x24, [%x[inptrs], #0x30]\n"
+ "zip2 z16.b, z27.b, z20.b\n"
+ "st1b { z29.b }, p2, [SP]\n"
+ "zip1 z20.b, z19.b, z18.b\n"
+ "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
+ "zip1 z18.b, z25.b, z21.b\n"
+ "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
+ "zip2 z17.b, z25.b, z21.b\n"
+ "ld1w { z1.s }, p2/Z, [%x[params]]\n"
+ "zip1 z0.b, z26.b, z23.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z30.b, z26.b, z23.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z16.b, z24.b, z22.b\n"
+ "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
+ "zip2 z16.b, z24.b, z22.b\n"
+ "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
+ "zip1 z28.b, z20.b, z18.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip2 z26.b, z20.b, z18.b\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
+ "mov z24.d, z1.d\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "mov z22.d, z1.d\n"
+ "mov z21.d, z1.d\n"
+ "sdot z1.s, z31.b, z5.b\n"
+ "sdot z22.s, z31.b, z3.b\n"
+ "sdot z7.s, z12.b, z3.b\n"
+ "sdot z1.s, z29.b, z3.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "sdot z22.s, z29.b, z0.b\n"
+ "sdot z7.s, z12.b, z0.b\n"
+ "sdot z1.s, z27.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z22.s, z27.b, z28.b\n"
+ "mov z20.d, z7.d\n"
+ "sdot z7.s, z12.b, z5.b\n"
+ "sdot z20.s, z12.b, z28.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "sdot z21.s, z31.b, z3.b\n"
+ "sdot z6.s, z12.b, z3.b\n"
+ "sdot z24.s, z31.b, z5.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "sdot z21.s, z29.b, z0.b\n"
+ "sdot z6.s, z12.b, z0.b\n"
+ "sdot z24.s, z29.b, z3.b\n"
+ "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "sdot z21.s, z27.b, z28.b\n"
+ "mov z19.d, z6.d\n"
+ "sdot z24.s, z27.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n"
+ "sdot z6.s, z12.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [SP]\n"
+ "sdot z19.s, z12.b, z28.b\n"
+ "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "mov z7.s, #0x0\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "sdot z7.s, z12.b, z2.b\n"
+ "mov z6.s, #0x0\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ "sdot z7.s, z12.b, z30.b\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z17.d, z22.d, z23.d\n"
+ "mov z20.d, z7.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sdot z7.s, z12.b, z4.b\n"
+ "sdot z20.s, z12.b, z26.b\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "and z16.d, z21.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "add z24.s, z24.s, z8.s\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z22.s, z22.s, z8.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "mov z24.d, z1.d\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "add z21.s, z21.s, z8.s\n"
+ "mov z22.d, z1.d\n"
+ "sdot z22.s, z31.b, z2.b\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "sdot z22.s, z29.b, z30.b\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "mov z21.d, z1.d\n"
+ "incw x19\n"
+ "sdot z1.s, z31.b, z4.b\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "sdot z22.s, z27.b, z26.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "sdot z1.s, z29.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z24.s, z31.b, z4.b\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ "sdot z1.s, z27.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z21.s, z31.b, z2.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z24.s, z29.b, z2.b\n"
+ "sdot z6.s, z12.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ "sdot z21.s, z29.b, z30.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "sdot z24.s, z27.b, z30.b\n"
+ "sdot z6.s, z12.b, z30.b\n"
+ "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n"
+ "and z17.d, z22.d, z23.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sdot z21.s, z27.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "mov z19.d, z6.d\n"
+ "sdot z6.s, z12.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n"
+ "sdot z19.s, z12.b, z26.b\n"
+ "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "mov z7.s, #0x0\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "sdot z7.s, z12.b, z3.b\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "mov z6.s, #0x0\n"
+ "sdot z7.s, z12.b, z0.b\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "mov z20.d, z7.d\n"
+ "sdot z7.s, z12.b, z5.b\n"
+ "sdot z20.s, z12.b, z28.b\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ "and z16.d, z21.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "add z24.s, z24.s, z8.s\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z21.s, z21.s, z8.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "mov z24.d, z1.d\n"
+ "mov z22.d, z1.d\n"
+ "sdot z22.s, z31.b, z3.b\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "mov z21.d, z1.d\n"
+ "incw x19\n"
+ "sdot z1.s, z31.b, z5.b\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "sdot z22.s, z29.b, z0.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "sdot z1.s, z29.b, z3.b\n"
+ "sdot z22.s, z27.b, z28.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "sdot z24.s, z31.b, z5.b\n"
+ "sdot z1.s, z27.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z21.s, z31.b, z3.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z29.b, z3.b\n"
+ "sdot z6.s, z12.b, z3.b\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "sdot z21.s, z29.b, z0.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z27.b, z0.b\n"
+ "sdot z6.s, z12.b, z0.b\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "sdot z21.s, z27.b, z28.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "mov z7.s, #0x0\n"
+ "mov z19.d, z6.d\n"
+ "sdot z6.s, z12.b, z5.b\n"
+ "sdot z19.s, z12.b, z28.b\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sdot z7.s, z12.b, z2.b\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "mov z6.s, #0x0\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z17.d, z22.d, z23.d\n"
+ "and z16.d, z21.d, z23.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sdot z7.s, z12.b, z30.b\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "mov z20.d, z7.d\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "sdot z7.s, z12.b, z4.b\n"
+ "sdot z20.s, z12.b, z26.b\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z24.s, z24.s, z8.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "add z21.s, z21.s, z8.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "mov z24.d, z1.d\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "mov z22.d, z1.d\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "mov z21.d, z1.d\n"
+ "incw x19\n"
+ "sdot z1.s, z31.b, z4.b\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "sdot z22.s, z31.b, z2.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z1.s, z29.b, z2.b\n"
+ "sdot z22.s, z29.b, z30.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z24.s, z31.b, z4.b\n"
+ "sdot z1.s, z27.b, z30.b\n"
+ "sdot z22.s, z27.b, z26.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "sdot z21.s, z31.b, z2.b\n"
+ "sdot z24.s, z29.b, z2.b\n"
+ "sdot z6.s, z12.b, z2.b\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "sdot z21.s, z29.b, z30.b\n"
+ "sdot z24.s, z27.b, z30.b\n"
+ "sdot z6.s, z12.b, z30.b\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "sdot z21.s, z27.b, z26.b\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ "mov z19.d, z6.d\n"
+ "sdot z6.s, z12.b, z4.b\n"
+ "sdot z19.s, z12.b, z26.b\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "and z17.d, z22.d, z23.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z16.d, z21.d, z23.d\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z24.s, z24.s, z8.s\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "add z21.s, z21.s, z8.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "incw x19\n"
+ "whilelt p1.b, x19, %x[n_channels]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
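
The requantisation tail repeated through the loop above (SQRDMULH, the AND/ASR/SQADD sign fixup, SRSHL by the per-channel shift, the addition of c_offset and the SMAX/SMIN clamp) is ordinary fixed-point requantisation. A minimal scalar sketch follows; the names are illustrative, SQRDMULH's saturating corner case is omitted, and the sign fixup (which nudges negative values so the rounding shift matches gemmlowp-style rounding division) is folded into srshl here:

#include <algorithm>
#include <cstdint>

// SQRDMULH: rounding doubling multiply returning the high 32 bits.
// (Saturation of INT32_MIN * INT32_MIN is omitted in this sketch.)
static int32_t sqrdmulh(int32_t a, int32_t b)
{
  const int64_t p = 2 * (int64_t) a * (int64_t) b + (1ll << 31);
  return (int32_t) (p >> 32);
}

// SRSHL: rounding arithmetic shift by a signed amount; the shifts
// loaded for requantisation are negative, i.e. right shifts.
static int32_t srshl(int32_t x, int32_t shift)
{
  if (shift >= 0) return x << shift;
  return (int32_t) (((int64_t) x + (1ll << (-shift - 1))) >> -shift);
}

// One requantised output: scale, shift, re-centre on c_offset, clamp.
static int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
  const int32_t v = srshl(sqrdmulh(acc, mul), shift) + c_offset;
  return (int8_t) std::max(minval, std::min(maxval, v));
}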
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7dd241a8cf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
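
The input tile constants in these descriptor structs are not free parameters; each follows from the output tile, stride and kernel size. A small compile-time check of the relation, using the values declared in this header and in the s2 and 5x5 variants below:

// Per dimension: input = (output - 1) * stride + kernel.
constexpr unsigned int required_input(unsigned int out_size,
                                      unsigned int stride,
                                      unsigned int kernel_size)
{
  return (out_size - 1) * stride + kernel_size;
}

static_assert(required_input(2, 1, 3) == 4, "3x3/s1 reads a 4x4 tile");
static_assert(required_input(2, 2, 3) == 5, "3x3/s2 reads a 5x5 tile");
static_assert(required_input(2, 1, 5) == 6, "5x5/s1 reads a 6x6 tile");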
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8bf5badfaf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x15, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z18.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z13.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x15, x17\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x15, x17\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "mov x19, x15\n"
+ "incw x19\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x17\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z17.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z17.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
+ "mov z9.d, z11.d\n"
+ "ld1sb { z0.h }, p4/Z, [x16]\n"
+ ".inst 0x45521000 // ssublb z0.h, z0.b, z18.b\n"
+ "mov z20.d, z17.d\n"
+ "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "mov z24.d, z11.d\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+ ".inst 0x45521021 // ssublb z1.h, z1.b, z18.b\n"
+ "mov z19.d, z17.d\n"
+ "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "mov z26.d, z11.d\n"
+ "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+ ".inst 0x45521042 // ssublb z2.h, z2.b, z18.b\n"
+ "mov z23.d, z17.d\n"
+ "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+ ".inst 0x45521063 // ssublb z3.h, z3.b, z18.b\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+ ".inst 0x45521084 // ssublb z4.h, z4.b, z18.b\n"
+ "inch x16, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p4/Z, [x16]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ ".inst 0x455210a5 // ssublb z5.h, z5.b, z18.b\n"
+ ".inst 0x455210c6 // ssublb z6.h, z6.b, z18.b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ ".inst 0x455210e7 // ssublb z7.h, z7.b, z18.b\n"
+ ".inst 0x45521108 // ssublb z8.h, z8.b, z18.b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x15]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ld1sb { z30.h }, p3/Z, [x22, x15]\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x15]\n"
+ "ld1sb { z27.h }, p3/Z, [x19, x15]\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c137b // ssublb z27.h, z27.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x448443eb // smlalb z11.s, p4/M, z31.h, z4.h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "whilelt p0.h, x14, x17\n"
+ ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "inch x16\n"
+ ".inst 0x448343e9 // smlalb z9.s, p4/M, z31.h, z3.h\n"
+ "ldr x26, [x12, #0x38]\n"
+ ".inst 0x448347f4 // smlalt z20.s, p4/M, z31.h, z3.h\n"
+ "ldr x25, [x12, #0x40]\n"
+ ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
+ "ldr x19, [x12, #0x48]\n"
+ ".inst 0x448147f3 // smlalt z19.s, p4/M, z31.h, z1.h\n"
+ "ldr x24, [x12, #0x50]\n"
+ ".inst 0x448043fa // smlalb z26.s, p4/M, z31.h, z0.h\n"
+ "ldr x23, [x12, #0x58]\n"
+ ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
+ "ld1sb { z31.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x448043cb // smlalb z11.s, p4/M, z30.h, z0.h\n"
+ "ldr x22, [x12, #0x60]\n"
+ ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z30.h }, p3/Z, [x19, x15]\n"
+ ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ "ldr x21, [x12, #0x68]\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x15]\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
+ "ldr x20, [x12, #0x70]\n"
+ ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
+ "ldr x19, [x12, #0x78]\n"
+ ".inst 0x44844389 // smlalb z9.s, p4/M, z28.h, z4.h\n"
+ "ld1w { z25.s }, p2/Z, [x13]\n"
+ ".inst 0x44844794 // smlalt z20.s, p4/M, z28.h, z4.h\n"
+ "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n"
+ "addvl x13, x13, #2\n"
+ ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44824793 // smlalt z19.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x4481439a // smlalb z26.s, p4/M, z28.h, z1.h\n"
+ "uzp1 z10.s, z25.s, z16.s\n"
+ "uzp2 z22.s, z25.s, z16.s\n"
+ "ld1w { z25.s }, p2/Z, [x11]\n"
+ ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
+ "ld1sb { z28.h }, p3/Z, [x26, x15]\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
+ "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x448647f3 // smlalt z19.s, p4/M, z31.h, z6.h\n"
+ "ld1sb { z31.h }, p3/Z, [x25, x15]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "uzp1 z21.s, z25.s, z16.s\n"
+ "uzp2 z25.s, z25.s, z16.s\n"
+ ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44864369 // smlalb z9.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44864774 // smlalt z20.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844773 // smlalt z19.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4483437a // smlalb z26.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834777 // smlalt z23.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x448843ba // smlalb z26.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
+ "ld1sb { z29.h }, p3/Z, [x24, x15]\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x44804389 // smlalb z9.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44804794 // smlalt z20.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z28.h }, p3/Z, [x23, x15]\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448143e9 // smlalb z9.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147f4 // smlalt z20.s, p4/M, z31.h, z1.h\n"
+ "ld1sb { z31.h }, p3/Z, [x22, x15]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x448843cb // smlalb z11.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448743c9 // smlalb z9.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448747d4 // smlalt z20.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448547d3 // smlalt z19.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448443da // smlalb z26.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447d7 // smlalt z23.s, p4/M, z30.h, z4.h\n"
+ "ld1sb { z30.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
+ ".inst 0x448343ab // smlalb z11.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448047b3 // smlalt z19.s, p4/M, z29.h, z0.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x15]\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x44854794 // smlalt z20.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4482439a // smlalb z26.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44824797 // smlalt z23.s, p4/M, z28.h, z2.h\n"
+ "ld1sb { z28.h }, p3/Z, [x19, x15]\n"
+ "inch x15\n"
+ ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
+ "whilelt p2.s, x15, x17\n"
+ ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
+ "mov x19, x15\n"
+ ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x448347f3 // smlalt z19.s, p4/M, z31.h, z3.h\n"
+ "incw x19\n"
+ ".inst 0x448843c9 // smlalb z9.s, p4/M, z30.h, z8.h\n"
+ "whilelt p1.s, x19, x17\n"
+ ".inst 0x04aa756b // sqrdmulh z11.s, z11.s, z10.s\n"
+ "whilelt p3.h, x15, x17\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448847d4 // smlalt z20.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
+ "and z16.d, z11.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z1.d, z17.d, z25.d\n"
+ "and z27.d, z9.d, z21.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n"
+ ".inst 0x448543da // smlalb z26.s, p4/M, z30.h, z5.h\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ ".inst 0x448547d7 // smlalt z23.s, p4/M, z30.h, z5.h\n"
+ "sqadd z11.s, z11.s, z16.s\n"
+ ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
+ "and z16.d, z20.d, z25.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z1.s\n"
+ "sqadd z9.s, z9.s, z27.s\n"
+ ".inst 0x448747b3 // smlalt z19.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448643ba // smlalb z26.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x448647b7 // smlalt z23.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
+ "sqadd z20.s, z20.s, z16.s\n"
+ ".inst 0x44884793 // smlalt z19.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x4487439a // smlalb z26.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04aa7718 // sqrdmulh z24.s, z24.s, z10.s\n"
+ ".inst 0x44874797 // smlalt z23.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n"
+ ".inst 0x04aa775a // sqrdmulh z26.s, z26.s, z10.s\n"
+ "and z16.d, z24.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z7.d, z19.d, z25.d\n"
+ "and z3.d, z26.d, z21.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
+ ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
+ "add z11.s, z11.s, z15.s\n"
+ "add z17.s, z17.s, z15.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "add z9.s, z9.s, z15.s\n"
+ "sqadd z26.s, z26.s, z3.s\n"
+ "and z16.d, z23.d, z25.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "smin z11.s, p4/M, z11.s, z14.s\n"
+ "smin z17.s, p4/M, z17.s, z14.s\n"
+ "smin z9.s, p4/M, z9.s, z14.s\n"
+ ".inst 0x44829334 // srshl z20.s, p4/M, z20.s, z25.s\n"
+ ".inst 0x448292b8 // srshl z24.s, p4/M, z24.s, z21.s\n"
+ "smax z11.s, p4/M, z11.s, z13.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "add z20.s, z20.s, z15.s\n"
+ "add z24.s, z24.s, z15.s\n"
+ "smax z17.s, p4/M, z17.s, z13.s\n"
+ "smax z9.s, p4/M, z9.s, z13.s\n"
+ "smin z20.s, p4/M, z20.s, z14.s\n"
+ "smin z24.s, p4/M, z24.s, z14.s\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "st1b { z11.h }, p0, [x10, x14]\n"
+ "smax z20.s, p4/M, z20.s, z13.s\n"
+ ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n"
+ "smax z24.s, p4/M, z24.s, z13.s\n"
+ ".inst 0x448292ba // srshl z26.s, p4/M, z26.s, z21.s\n"
+ ".inst 0x44829337 // srshl z23.s, p4/M, z23.s, z25.s\n"
+ "trn1 z9.h, z9.h, z20.h\n"
+ "st1b { z9.h }, p0, [x9, x14]\n"
+ "add z19.s, z19.s, z15.s\n"
+ "add z26.s, z26.s, z15.s\n"
+ "add z23.s, z23.s, z15.s\n"
+ "smin z19.s, p4/M, z19.s, z14.s\n"
+ "smin z26.s, p4/M, z26.s, z14.s\n"
+ "smin z23.s, p4/M, z23.s, z14.s\n"
+ "smax z19.s, p4/M, z19.s, z13.s\n"
+ "smax z26.s, p4/M, z26.s, z13.s\n"
+ "smax z23.s, p4/M, z23.s, z13.s\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "st1b { z24.h }, p0, [x28, x14]\n"
+ "trn1 z26.h, z26.h, z23.h\n"
+ "st1b { z26.h }, p0, [x27, x14]\n"
+ "inch x14\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z17.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z17.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
+ "mov z9.d, z11.d\n"
+ "ld1sb { z0.h }, p4/Z, [x16]\n"
+ ".inst 0x45521000 // ssublb z0.h, z0.b, z18.b\n"
+ "mov z20.d, z17.d\n"
+ "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "mov z24.d, z11.d\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+ ".inst 0x45521021 // ssublb z1.h, z1.b, z18.b\n"
+ "mov z19.d, z17.d\n"
+ "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "mov z26.d, z11.d\n"
+ "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+ ".inst 0x45521042 // ssublb z2.h, z2.b, z18.b\n"
+ "mov z23.d, z17.d\n"
+ "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+ ".inst 0x45521063 // ssublb z3.h, z3.b, z18.b\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+ ".inst 0x45521084 // ssublb z4.h, z4.b, z18.b\n"
+ "inch x16, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p4/Z, [x16]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ ".inst 0x455210a5 // ssublb z5.h, z5.b, z18.b\n"
+ ".inst 0x455210c6 // ssublb z6.h, z6.b, z18.b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ ".inst 0x455210e7 // ssublb z7.h, z7.b, z18.b\n"
+ ".inst 0x45521108 // ssublb z8.h, z8.b, z18.b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x15]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ld1sb { z30.h }, p3/Z, [x22, x15]\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x15]\n"
+ "ld1sb { z27.h }, p3/Z, [x19, x15]\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c137b // ssublb z27.h, z27.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
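
The arithmetic in the loop above is plain depthwise multiply-accumulate on zero-point-corrected values: SSUBLB widens each int8 input and weight to int16 while subtracting a_offset or b_offset, and SMLALB/SMLALT accumulate the even and odd lanes into separate int32 vectors that TRN1 re-interleaves at the stores. A scalar model of one output element, with illustrative names:

#include <cstdint>

// One channel of one output element of the 3x3 stride-1 kernel:
// widen, subtract the quantisation offsets, accumulate into int32.
int32_t depthwise_3x3_acc(const int8_t input[4][4],
                          const int8_t weights[3][3],
                          int32_t bias, int32_t a_offset, int32_t b_offset,
                          int out_y, int out_x)
{
  int32_t acc = bias;
  for (int ky = 0; ky < 3; ky++)
    for (int kx = 0; kx < 3; kx++)
      acc += (int32_t(input[out_y + ky][out_x + kx]) - a_offset)
           * (int32_t(weights[ky][kx]) - b_offset);
  return acc; // then requantised as in the scalar sketch further up
}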
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..89507ef9ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+ sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b773ca1fe6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x7, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x8, #0x0\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z19.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z20.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x7, x5\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x7, x5\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x19, x7\n"
+ "incw x19\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x5\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z18.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z18.s, z16.s\n"
+ "mov z11.d, z13.d\n"
+ "ld1sb { z0.h }, p4/Z, [x6]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+ "mov z18.d, z13.d\n"
+ "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+ ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
+ "mov z10.d, z16.d\n"
+ "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+ "mov z22.d, z13.d\n"
+ "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "mov z23.d, z16.d\n"
+ "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ "inch x6, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p4/Z, [x6]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
+ ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ld1sb { z31.h }, p3/Z, [x26, x7]\n"
+ ".inst 0x455313ff // ssublb z31.h, z31.b, z19.b\n"
+ "ld1sb { z30.h }, p3/Z, [x25, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x455313de // ssublb z30.h, z30.b, z19.b\n"
+ "ld1sb { z28.h }, p3/Z, [x23, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
+ "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "ld1sb { z24.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
+ ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
+ ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
+ ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
+ "1:" // Loop
+ ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
+ "ldr x23, [x16, #0x40]\n"
+ "whilelt p0.h, x8, x5\n"
+ ".inst 0x448847f0 // smlalt z16.s, p4/M, z31.h, z8.h\n"
+ "ldr x22, [x16, #0x48]\n"
+ "inch x6\n"
+ ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
+ "ldr x21, [x16, #0x50]\n"
+ ".inst 0x448647e9 // smlalt z9.s, p4/M, z31.h, z6.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x448243f2 // smlalb z18.s, p4/M, z31.h, z2.h\n"
+ "ldr x19, [x16, #0x60]\n"
+ ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
+ "ldr x10, [x16, #0x68]\n"
+ ".inst 0x448043f6 // smlalb z22.s, p4/M, z31.h, z0.h\n"
+ "ldr x9, [x16, #0x70]\n"
+ ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
+ "ldr x28, [x16, #0x78]\n"
+ ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
+ "ldr x27, [x16, #0x80]\n"
+ ".inst 0x448047d0 // smlalt z16.s, p4/M, z30.h, z0.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ "ldr x25, [x16, #0x90]\n"
+ ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
+ ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
+ "ldr x24, [x16, #0x98]\n"
+ ".inst 0x448147b0 // smlalt z16.s, p4/M, z29.h, z1.h\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
+ ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z27.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
+ ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
+ "ldr x22, [x16, #0xa8]\n"
+ ".inst 0x44834750 // smlalt z16.s, p4/M, z26.h, z3.h\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
+ ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
+ "ldr x21, [x16, #0xb0]\n"
+ ".inst 0x44844730 // smlalt z16.s, p4/M, z25.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
+ ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
+ "ldr x19, [x16, #0xc0]\n"
+ ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n"
+ "ld1w { z21.s }, p2/Z, [x17]\n"
+ ".inst 0x44804709 // smlalt z9.s, p4/M, z24.h, z0.h\n"
+ "ld1sb { z24.h }, p3/Z, [x9, x7]\n"
+ ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
+ ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
+ "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n"
+ ".inst 0x448447a9 // smlalt z9.s, p4/M, z29.h, z4.h\n"
+ "ld1sb { z29.h }, p3/Z, [x10, x7]\n"
+ "addvl x17, x17, #2\n"
+ ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
+ "uzp1 z30.s, z21.s, z17.s\n"
+ "uzp2 z31.s, z21.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x15]\n"
+ ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
+ "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
+ "ld1sb { z28.h }, p3/Z, [x27, x7]\n"
+ ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
+ ".inst 0x44854770 // smlalt z16.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z27.h }, p3/Z, [x28, x7]\n"
+ ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
+ ".inst 0x44834352 // smlalb z18.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x4483474a // smlalt z10.s, p4/M, z26.h, z3.h\n"
+ "ld1sb { z26.h }, p3/Z, [x26, x7]\n"
+ ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
+ ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44864730 // smlalt z16.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44804332 // smlalb z18.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x4480472a // smlalt z10.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z25.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
+ "uzp1 z0.s, z21.s, z17.s\n"
+ "uzp2 z21.s, z21.s, z17.s\n"
+ ".inst 0x448443b2 // smlalb z18.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
+ "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
+ ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x44874710 // smlalt z16.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x44814312 // smlalb z18.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x4481470a // smlalt z10.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z24.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
+ ".inst 0x04be75ad // sqrdmulh z13.s, z13.s, z30.s\n"
+ ".inst 0x04bf7610 // sqrdmulh z16.s, z16.s, z31.s\n"
+ ".inst 0x44844376 // smlalb z22.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844777 // smlalt z23.s, p4/M, z27.h, z4.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
+ "and z4.d, z13.d, z0.d\n"
+ "and z17.d, z16.d, z21.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44814396 // smlalb z22.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44864332 // smlalb z18.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x4486472a // smlalt z10.s, p4/M, z25.h, z6.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
+ "sqadd z13.s, z13.s, z4.s\n"
+ "sqadd z16.s, z16.s, z17.s\n"
+ ".inst 0x44854356 // smlalb z22.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854757 // smlalt z23.s, p4/M, z26.h, z5.h\n"
+ "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
+ ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448243b6 // smlalb z22.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x448247b7 // smlalt z23.s, p4/M, z29.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x19, x7]\n"
+ "inch x7\n"
+ ".inst 0x04be756b // sqrdmulh z11.s, z11.s, z30.s\n"
+ "whilelt p2.s, x7, x5\n"
+ ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n"
+ "mov x19, x7\n"
+ ".inst 0x44874372 // smlalb z18.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
+ ".inst 0x4487476a // smlalt z10.s, p4/M, z27.h, z7.h\n"
+ "incw x19\n"
+ ".inst 0x44834316 // smlalb z22.s, p4/M, z24.h, z3.h\n"
+ "whilelt p1.s, x19, x5\n"
+ "and z1.d, z11.d, z0.d\n"
+ "whilelt p3.h, x7, x5\n"
+ "and z17.d, z9.d, z21.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ ".inst 0x44854312 // smlalb z18.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x4485470a // smlalt z10.s, p4/M, z24.h, z5.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44834717 // smlalt z23.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44874356 // smlalb z22.s, p4/M, z26.h, z7.h\n"
+ ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n"
+ ".inst 0x44884332 // smlalb z18.s, p4/M, z25.h, z8.h\n"
+ "sqadd z11.s, z11.s, z1.s\n"
+ "sqadd z9.s, z9.s, z17.s\n"
+ "add z13.s, z13.s, z14.s\n"
+ ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
+ ".inst 0x44874757 // smlalt z23.s, p4/M, z26.h, z7.h\n"
+ ".inst 0x4488472a // smlalt z10.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x44864336 // smlalb z22.s, p4/M, z25.h, z6.h\n"
+ "and z17.d, z18.d, z0.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x04bf754a // sqrdmulh z10.s, z10.s, z31.s\n"
+ ".inst 0x44864737 // smlalt z23.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448843b6 // smlalb z22.s, p4/M, z29.h, z8.h\n"
+ "smin z13.s, p4/M, z13.s, z15.s\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "and z1.d, z10.d, z21.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "add z16.s, z16.s, z14.s\n"
+ "sqadd z18.s, z18.s, z17.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
+ "smax z13.s, p4/M, z13.s, z20.s\n"
+ "smin z16.s, p4/M, z16.s, z15.s\n"
+ "sqadd z10.s, z10.s, z1.s\n"
+ "and z2.d, z22.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n"
+ "smax z16.s, p4/M, z16.s, z20.s\n"
+ ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n"
+ ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
+ ".inst 0x44829012 // srshl z18.s, p4/M, z18.s, z0.s\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "st1b { z13.h }, p0, [x14, x8]\n"
+ "add z11.s, z11.s, z14.s\n"
+ "add z9.s, z9.s, z14.s\n"
+ "add z18.s, z18.s, z14.s\n"
+ "sqadd z22.s, z22.s, z2.s\n"
+ "and z16.d, z23.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "smin z11.s, p4/M, z11.s, z15.s\n"
+ "smin z9.s, p4/M, z9.s, z15.s\n"
+ "smin z18.s, p4/M, z18.s, z15.s\n"
+ ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n"
+ ".inst 0x44829016 // srshl z22.s, p4/M, z22.s, z0.s\n"
+ "smax z11.s, p4/M, z11.s, z20.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "add z10.s, z10.s, z14.s\n"
+ "add z22.s, z22.s, z14.s\n"
+ "smax z9.s, p4/M, z9.s, z20.s\n"
+ "smax z18.s, p4/M, z18.s, z20.s\n"
+ "smin z10.s, p4/M, z10.s, z15.s\n"
+ "smin z22.s, p4/M, z22.s, z15.s\n"
+ "trn1 z11.h, z11.h, z9.h\n"
+ "st1b { z11.h }, p0, [x13, x8]\n"
+ "smax z10.s, p4/M, z10.s, z20.s\n"
+ ".inst 0x448292b7 // srshl z23.s, p4/M, z23.s, z21.s\n"
+ "smax z22.s, p4/M, z22.s, z20.s\n"
+ "trn1 z18.h, z18.h, z10.h\n"
+ "st1b { z18.h }, p0, [x12, x8]\n"
+ "add z23.s, z23.s, z14.s\n"
+ "smin z23.s, p4/M, z23.s, z15.s\n"
+ "smax z23.s, p4/M, z23.s, z20.s\n"
+ "trn1 z22.h, z22.h, z23.h\n"
+ "st1b { z22.h }, p0, [x11, x8]\n"
+ "inch x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z18.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z18.s, z16.s\n"
+ "mov z11.d, z13.d\n"
+ "ld1sb { z0.h }, p4/Z, [x6]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+ "mov z18.d, z13.d\n"
+ "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+ ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
+ "mov z10.d, z16.d\n"
+ "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+ "mov z22.d, z13.d\n"
+ "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "mov z23.d, z16.d\n"
+ "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ "inch x6, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p4/Z, [x6]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
+ ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ld1sb { z31.h }, p3/Z, [x26, x7]\n"
+ ".inst 0x455313ff // ssublb z31.h, z31.b, z19.b\n"
+ "ld1sb { z30.h }, p3/Z, [x25, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x455313de // ssublb z30.h, z30.b, z19.b\n"
+ "ld1sb { z28.h }, p3/Z, [x23, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n"
+ "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "ld1sb { z24.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n"
+ ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n"
+ ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n"
+ ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
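
As in the s1 variant, the bias and requantisation parameters are read two vectors at a time (one ld1w under p2, the next under p1) and split with UZP1/UZP2 so that even-numbered channels feed the SMLALB accumulators and odd-numbered channels the SMLALT ones. A sketch of the de-interleave, assuming VL int32 lanes per vector:

#include <cstdint>

// UZP1/UZP2 model: view a:b as one concatenation of 2*VL elements,
// then uzp1 gathers the even-indexed ones and uzp2 the odd-indexed.
void uzp(const int32_t *a, const int32_t *b, int VL,
         int32_t *even /*uzp1*/, int32_t *odd /*uzp2*/)
{
  for (int i = 0; i < VL; i++)
  {
    even[i] = (2 * i < VL) ? a[2 * i] : b[2 * i - VL];
    odd[i]  = (2 * i + 1 < VL) ? a[2 * i + 1] : b[2 * i + 1 - VL];
  }
}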
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..54ac1c2e0b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size;
+
+ kern_type kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+ sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
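The geometry constants in this header are not independent: a depth-first kernel producing an output_rows x output_cols tile needs (output - 1) * stride + kernel input points per dimension, which is where the 6x6 patch comes from. A compile-time statement of that identity (the helper name is ours, not the library's):

    // Input extent needed per dimension, assuming unit dilation.
    constexpr unsigned int patch_extent(unsigned int output, unsigned int stride, unsigned int kernel)
    {
      return (output - 1) * stride + kernel;
    }

    // 2x2 output at stride 1 under a 5x5 kernel; rows and columns are symmetric here.
    static_assert(patch_extent(2, 1, 5) == 6, "matches input_rows and input_cols above");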
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c02bb584e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,643 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
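+      // Remap the 36 patch pointers (6x6 input window) into the order the assembly walks them; only entries 0..13 are permuted.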
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+      // Entries 14..35 are already in order.
+      for (unsigned int i = 14; i < 36; i++)
+      {
+        inptrs[i] = inptrs_raw[i];
+      }
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x2, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x3, #0x0\n"
+ "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z17.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x2, x0\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x2, x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "mov x19, x2\n"
+ "incw x19\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x0\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z19.s }, p2/Z, [x19]\n"
+ "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z19.s, z6.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z19.s, z6.s\n"
+ "mov z19.d, z11.d\n"
+ "ld1sb { z0.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+ "mov z7.d, z11.d\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
+ "mov z6.d, z16.d\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+ "mov z12.d, z11.d\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "mov z8.d, z16.d\n"
+ "ldp x28, x27, [x5, #0x0]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ldp x26, x25, [x5, #0x10]\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ldp x24, x23, [x5, #0x20]\n"
+ "ldp x22, x21, [x5, #0x30]\n"
+ "ldp x20, x19, [x5, #0x40]\n"
+ "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
+ "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
+ "ld1sb { z28.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z27.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
+ "ld1sb { z23.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
+ "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
+ "ld1sb { z24.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ "ld1sb { z22.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
+ ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ "1:" // Loop
+ ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "whilelt p0.h, x3, x0\n"
+ ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
+ "ldr x19, [x5, #0x58]\n"
+ ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
+ "ldr x25, [x5, #0x60]\n"
+ ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
+ ".inst 0x448043a7 // smlalb z7.s, p4/M, z29.h, z0.h\n"
+ "ldr x24, [x5, #0x68]\n"
+ ".inst 0x448047a6 // smlalt z6.s, p4/M, z29.h, z0.h\n"
+ "ldr x23, [x5, #0x70]\n"
+ ".inst 0x4480438c // smlalb z12.s, p4/M, z28.h, z0.h\n"
+ "ldr x22, [x5, #0x78]\n"
+ ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
+ "ldr x15, [x5, #0x80]\n"
+ ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
+ "ld1sb { z30.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
+ ".inst 0x44814373 // smlalb z19.s, p4/M, z27.h, z1.h\n"
+ "ldr x21, [x5, #0x88]\n"
+ ".inst 0x44814769 // smlalt z9.s, p4/M, z27.h, z1.h\n"
+ "ldr x20, [x5, #0x90]\n"
+ ".inst 0x44814387 // smlalb z7.s, p4/M, z28.h, z1.h\n"
+ "ldr x19, [x5, #0x98]\n"
+ ".inst 0x44814786 // smlalt z6.s, p4/M, z28.h, z1.h\n"
+ "ldr x14, [x5, #0xa0]\n"
+ ".inst 0x448142ec // smlalb z12.s, p4/M, z23.h, z1.h\n"
+ "ldr x13, [x5, #0xa8]\n"
+ ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
+ ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
+ "ldr x12, [x5, #0xb0]\n"
+ ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z27.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
+ "ldr x11, [x5, #0xb8]\n"
+ ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
+ "ldr x10, [x5, #0xc0]\n"
+ ".inst 0x448242e7 // smlalb z7.s, p4/M, z23.h, z2.h\n"
+ "ldr x9, [x5, #0xc8]\n"
+ ".inst 0x448246e6 // smlalt z6.s, p4/M, z23.h, z2.h\n"
+ "ldr x28, [x5, #0xd0]\n"
+ ".inst 0x448243ec // smlalb z12.s, p4/M, z31.h, z2.h\n"
+ "ldr x27, [x5, #0xd8]\n"
+ ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
+ "ldr x26, [x5, #0xe0]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ "ld1sb { z25.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
+ "ldr x25, [x5, #0xe8]\n"
+ ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
+ ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
+ "ld1w { z18.s }, p2/Z, [x4]\n"
+ ".inst 0x448343e7 // smlalb z7.s, p4/M, z31.h, z3.h\n"
+ "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n"
+ "addvl x4, x4, #2\n"
+ ".inst 0x448347e6 // smlalt z6.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448343cc // smlalb z12.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "uzp1 z21.s, z18.s, z20.s\n"
+ "uzp2 z10.s, z18.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
+ "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n"
+ "addvl x6, x6, #2\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x44844373 // smlalb z19.s, p4/M, z27.h, z4.h\n"
+ "ldr x24, [x5, #0xf0]\n"
+ ".inst 0x44844769 // smlalt z9.s, p4/M, z27.h, z4.h\n"
+ "ld1sb { z27.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448443c7 // smlalb z7.s, p4/M, z30.h, z4.h\n"
+ "ldr x23, [x5, #0xf8]\n"
+ ".inst 0x448447c6 // smlalt z6.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448043ab // smlalb z11.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
+ "uzp1 z29.s, z18.s, z20.s\n"
+ "uzp2 z20.s, z18.s, z20.s\n"
+ ".inst 0x44804393 // smlalb z19.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x448042c7 // smlalb z7.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448046c6 // smlalt z6.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x4480432c // smlalb z12.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
+ "ld1sb { z28.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
+ ".inst 0x448142f3 // smlalb z19.s, p4/M, z23.h, z1.h\n"
+ "ldr x22, [x5, #0x100]\n"
+ ".inst 0x448146e9 // smlalt z9.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44814327 // smlalb z7.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814726 // smlalt z6.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x4481430c // smlalb z12.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
+ ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
+ ".inst 0x448242eb // smlalb z11.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
+ "ld1sb { z23.h }, p3/Z, [x15, x2]\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ ".inst 0x448243f3 // smlalb z19.s, p4/M, z31.h, z2.h\n"
+ "ldr x21, [x5, #0x108]\n"
+ ".inst 0x448247e9 // smlalt z9.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44824307 // smlalb z7.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824706 // smlalt z6.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x4482436c // smlalb z12.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x448343eb // smlalb z11.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
+ ".inst 0x448343d3 // smlalb z19.s, p4/M, z30.h, z3.h\n"
+ "ldr x20, [x5, #0x110]\n"
+ ".inst 0x448347c9 // smlalt z9.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44834367 // smlalb z7.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834766 // smlalt z6.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448342ec // smlalb z12.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x448443cb // smlalb z11.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
+ "ld1sb { z30.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
+ ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
+ "ldr x19, [x5, #0x118]\n"
+ ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z26.h }, p3/Z, [x14, x2]\n"
+ ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
+ ".inst 0x448442e7 // smlalb z7.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448446e6 // smlalt z6.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x4484438c // smlalb z12.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448042cb // smlalb z11.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
+ "ld1sb { z22.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ ".inst 0x44804333 // smlalb z19.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804729 // smlalt z9.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x448043e7 // smlalb z7.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047e6 // smlalt z6.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448043cc // smlalb z12.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4481432b // smlalb z11.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
+ "ld1sb { z25.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x44814313 // smlalb z19.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
+ ".inst 0x44814709 // smlalt z9.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x448143c7 // smlalb z7.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448147c6 // smlalt z6.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x4481434c // smlalb z12.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
+ ".inst 0x4482430b // smlalb z11.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
+ "ld1sb { z24.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x44824373 // smlalb z19.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824347 // smlalb z7.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824746 // smlalt z6.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z27.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448342f3 // smlalb z19.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448346e9 // smlalt z9.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x448442eb // smlalb z11.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
+ "ld1sb { z23.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ ".inst 0x44844393 // smlalb z19.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z28.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
+ ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448442cc // smlalb z12.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
+ "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
+ ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x44804367 // smlalb z7.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44804766 // smlalt z6.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x448042ec // smlalb z12.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
+ "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
+ ".inst 0x44814353 // smlalb z19.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44814749 // smlalt z9.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x448142e7 // smlalb z7.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448146e6 // smlalt z6.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143ec // smlalb z12.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
+ ".inst 0x4482434b // smlalb z11.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
+ "ld1sb { z26.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
+ ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448243e7 // smlalb z7.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247e6 // smlalt z6.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243cc // smlalb z12.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ "ld1sb { z25.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
+ ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x448343c7 // smlalb z7.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448347c6 // smlalt z6.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x4483438c // smlalb z12.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x448442d3 // smlalb z19.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x448446c9 // smlalt z9.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x44844387 // smlalb z7.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844786 // smlalt z6.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1]\n"
+ "inch x1\n"
+ ".inst 0x4480436b // smlalb z11.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
+ "ld1sb { z27.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x448042f3 // smlalb z19.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448046e9 // smlalt z9.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x44804327 // smlalb z7.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804726 // smlalt z6.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
+ ".inst 0x4480430c // smlalb z12.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x448142eb // smlalb z11.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143f3 // smlalb z19.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44814307 // smlalb z7.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814706 // smlalt z6.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z24.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x4481436c // smlalb z12.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243d3 // smlalb z19.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x448247c9 // smlalt z9.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824367 // smlalb z7.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824766 // smlalt z6.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z27.h }, p3/Z, [x19, x2]\n"
+ "inch x2\n"
+ ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
+ "whilelt p2.s, x2, x0\n"
+ ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
+ "mov x19, x2\n"
+ ".inst 0x448343cb // smlalb z11.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
+ "incw x19\n"
+ ".inst 0x44834393 // smlalb z19.s, p4/M, z28.h, z3.h\n"
+ "whilelt p1.s, x19, x0\n"
+ ".inst 0x44834789 // smlalt z9.s, p4/M, z28.h, z3.h\n"
+ "whilelt p3.h, x2, x0\n"
+ ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x4484438b // smlalb z11.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x04b5756b // sqrdmulh z11.s, z11.s, z21.s\n"
+ ".inst 0x04aa7610 // sqrdmulh z16.s, z16.s, z10.s\n"
+ ".inst 0x04b57673 // sqrdmulh z19.s, z19.s, z21.s\n"
+ ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
+ "and z31.d, z11.d, z29.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z23.d, z16.d, z20.d\n"
+ "and z25.d, z19.d, z29.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "and z18.d, z9.d, z20.d\n"
+ ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z31.s\n"
+ ".inst 0x4484436c // smlalb z12.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x04b574e7 // sqrdmulh z7.s, z7.s, z21.s\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z19.s, z19.s, z25.s\n"
+ ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "and z1.d, z7.d, z29.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "and z18.d, z6.d, z20.d\n"
+ ".inst 0x04b5758c // sqrdmulh z12.s, z12.s, z21.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448293ab // srshl z11.s, p4/M, z11.s, z29.s\n"
+ "and z30.d, z12.d, z29.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "add z11.s, z11.s, z14.s\n"
+ "sqadd z7.s, z7.s, z1.s\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "smin z11.s, p4/M, z11.s, z15.s\n"
+ ".inst 0x44829290 // srshl z16.s, p4/M, z16.s, z20.s\n"
+ "sqadd z12.s, z12.s, z30.s\n"
+ "and z3.d, z8.d, z20.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "add z16.s, z16.s, z14.s\n"
+ "smax z11.s, p4/M, z11.s, z5.s\n"
+ ".inst 0x448293b3 // srshl z19.s, p4/M, z19.s, z29.s\n"
+ ".inst 0x44829289 // srshl z9.s, p4/M, z9.s, z20.s\n"
+ "smin z16.s, p4/M, z16.s, z15.s\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "add z19.s, z19.s, z14.s\n"
+ "add z9.s, z9.s, z14.s\n"
+ "sqadd z8.s, z8.s, z3.s\n"
+ "add z7.s, z7.s, z14.s\n"
+ "smax z16.s, p4/M, z16.s, z5.s\n"
+ "smin z19.s, p4/M, z19.s, z15.s\n"
+ "smin z9.s, p4/M, z9.s, z15.s\n"
+ "smin z7.s, p4/M, z7.s, z15.s\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "st1b { z11.h }, p0, [x7, x3]\n"
+ "smax z19.s, p4/M, z19.s, z5.s\n"
+ "smax z9.s, p4/M, z9.s, z5.s\n"
+ "smax z7.s, p4/M, z7.s, z5.s\n"
+ ".inst 0x44829286 // srshl z6.s, p4/M, z6.s, z20.s\n"
+ ".inst 0x448293ac // srshl z12.s, p4/M, z12.s, z29.s\n"
+ "trn1 z19.h, z19.h, z9.h\n"
+ "st1b { z19.h }, p0, [x8, x3]\n"
+ "add z6.s, z6.s, z14.s\n"
+ ".inst 0x44829288 // srshl z8.s, p4/M, z8.s, z20.s\n"
+ "add z12.s, z12.s, z14.s\n"
+ "smin z6.s, p4/M, z6.s, z15.s\n"
+ "add z8.s, z8.s, z14.s\n"
+ "smin z12.s, p4/M, z12.s, z15.s\n"
+ "smax z6.s, p4/M, z6.s, z5.s\n"
+ "smin z8.s, p4/M, z8.s, z15.s\n"
+ "smax z12.s, p4/M, z12.s, z5.s\n"
+ "trn1 z7.h, z7.h, z6.h\n"
+ "st1b { z7.h }, p0, [x17, x3]\n"
+ "smax z8.s, p4/M, z8.s, z5.s\n"
+ "trn1 z12.h, z12.h, z8.h\n"
+ "st1b { z12.h }, p0, [x16, x3]\n"
+ "inch x3\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z19.s }, p2/Z, [x19]\n"
+ "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z19.s, z6.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z19.s, z6.s\n"
+ "mov z19.d, z11.d\n"
+ "ld1sb { z0.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+ "mov z7.d, z11.d\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n"
+ "mov z6.d, z16.d\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+ "mov z12.d, z11.d\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "mov z8.d, z16.d\n"
+ "ldp x28, x27, [x5, #0x0]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ldp x26, x25, [x5, #0x10]\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ldp x24, x23, [x5, #0x20]\n"
+ "ldp x22, x21, [x5, #0x30]\n"
+ "ldp x20, x19, [x5, #0x40]\n"
+ "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n"
+ "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n"
+ "ld1sb { z28.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z27.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
+ "ld1sb { z23.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
+ "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
+ "ld1sb { z24.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ "ld1sb { z22.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n"
+ ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
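The tail of the loop above is the Requantize32 epilogue in miniature: sqrdmulh by the per-channel multiplier (z21/z10), a sign-dependent fix-up (the and/asr/sqadd sequences) so negative accumulators round the way the reference requantization does, srshl by the per-channel shift (z29/z20), add c_offset (z14), clamp to [minval, maxval] (z5/z15), then trn1 and st1b to narrow back to int8. A scalar sketch of the same arithmetic — a model, not the library's code; it ignores the saturation corner at INT32_MIN and the exact tie behaviour of the fix-up:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of sqrdmulh: rounding doubling multiply, high half.
    inline int32_t rdmulh(int32_t a, int32_t mul)
    {
      return (int32_t) (((int64_t) a * mul + (1 << 30)) >> 31);
    }

    // Scalar model of srshl with a negative shift: rounding right shift.
    inline int32_t rshl(int32_t a, int32_t shift)
    {
      if (shift >= 0) return a << shift;
      const int n = -shift;
      return (int32_t) (((int64_t) a + (1ll << (n - 1))) >> n);
    }

    inline int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                             int32_t c_offset, int32_t minval, int32_t maxval)
    {
      const int32_t v = rshl(rdmulh(acc, mul), shift) + c_offset;
      return (int8_t) std::clamp(v, minval, maxval);
    }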
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..7ab83e8659
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 9;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+ sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
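Unlike the mla kernels above, these packed_to_nhwc variants take an opaque packed-parameter blob and a flat pointer list instead of a Params struct; the struct itself only carries compile-time geometry plus the kernel pointer. A schematic of the call through kern_type — every pointer here is a placeholder, and the real callers are the depthwise driver classes outside this patch:

    #include <cstdint>
    // Assumes the header above and arm_gemm.hpp are in scope.

    inline void run_tile(
      const sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst &k,
      const int8_t *const *inptrs,    // one per input row (the generic.cpp below loads five)
      int8_t *const *outptrs,         // output_rows * output_cols = 8 pointers
      const void *packed_params,      // weights and biases, packed for the dot layout
      unsigned int n_output_channels,
      const arm_gemm::Requantize32 &qp)
    {
      k.kernel(inptrs, outptrs, packed_params, n_output_channels, qp);
    }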
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f531912e72
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov z31.s, #0x0\n"
+ "ldr x24, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "mov z18.s, #0x0\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "lsl x9, %x[n_channels], #0x2\n"
+ "mov z29.s, #0x0\n"
+ "ldr x22, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "mov z28.s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "mov x19, #0x9\n"
+ "mov z13.s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "mov z14.s, #0x0\n"
+ "ld1b { z7.b }, p1/Z, [x24]\n"
+ "mov x19, #0x3\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z3.b }, p1/Z, [x23]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "mov z11.b, p0/z, #0x1\n"
+ "ld1b { z4.b }, p1/Z, [x22]\n"
+ "mov x28, #0x0\n"
+ "mov z10.d, z7.d\n"
+ "ld1b { z6.b }, p1/Z, [x21]\n"
+ "mov x27, #0x0\n"
+ "ext z10.b, z10.b, z10.b, #0x2\n"
+ "ld1b { z5.b }, p1/Z, [x20]\n"
+ "whilelt p1.b, x28, x9\n"
+ "mov z17.d, z7.d\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z26.d, z7.d\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "mov z19.d, z3.d\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "ext z19.b, z19.b, z19.b, #0x2\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z7.s, z7.s, z17.s\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "zip1 z10.s, z10.s, z26.s\n"
+ "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "zip1 z7.s, z7.s, z10.s\n"
+ "ld1w { z1.s }, p1/Z, [%x[params]]\n"
+ "mov z7.q, z7.q[0]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z2.d, z3.d\n"
+ "mov z20.d, z4.d\n"
+ "ext z2.b, z2.b, z2.b, #0x6\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "ext z20.b, z20.b, z20.b, #0x2\n"
+ "mov z17.d, z4.d\n"
+ "zip1 z19.s, z19.s, z2.s\n"
+ "zip1 z3.s, z3.s, z19.s\n"
+ "mov z3.q, z3.q[0]\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "mov z26.d, z4.d\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "mov z21.d, z6.d\n"
+ "zip1 z4.s, z4.s, z17.s\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "zip1 z20.s, z20.s, z26.s\n"
+ "zip1 z4.s, z4.s, z20.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z17.d, z6.d\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "mov z20.d, z6.d\n"
+ "ext z20.b, z20.b, z20.b, #0x6\n"
+ "mov z19.d, z5.d\n"
+ "zip1 z6.s, z6.s, z17.s\n"
+ "ext z19.b, z19.b, z19.b, #0x2\n"
+ "zip1 z21.s, z21.s, z20.s\n"
+ "zip1 z6.s, z6.s, z21.s\n"
+ "mov z6.q, z6.q[0]\n"
+ "mov z17.d, z5.d\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "mov z20.d, z5.d\n"
+ "ext z20.b, z20.b, z20.b, #0x6\n"
+ "mov z11.s, z11.s[0]\n"
+ "zip1 z5.s, z5.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "zip1 z19.s, z19.s, z20.s\n"
+ "zip1 z5.s, z5.s, z19.s\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "sdot z18.s, z11.b, z7.b[1]\n"
+ "sdot z29.s, z11.b, z7.b[2]\n"
+ "sdot z28.s, z11.b, z7.b[3]\n"
+ "sdot z13.s, z11.b, z3.b[0]\n"
+ "sdot z14.s, z11.b, z3.b[1]\n"
+ "sdot z15.s, z11.b, z3.b[2]\n"
+ "sdot z25.s, z11.b, z3.b[3]\n"
+ "sdot z26.s, z11.b, z4.b[0]\n"
+ "sdot z27.s, z11.b, z4.b[1]\n"
+ "sdot z24.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z4.b[3]\n"
+ "sdot z22.s, z11.b, z6.b[0]\n"
+ "sdot z21.s, z11.b, z6.b[1]\n"
+ "sdot z17.s, z11.b, z6.b[2]\n"
+ "sdot z20.s, z11.b, z6.b[3]\n"
+ "sdot z2.s, z11.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z5.b[1]\n"
+ "mov z31.d, z31.d\n"
+ "mov z18.d, z18.d\n"
+ "mov z29.d, z29.d\n"
+ "mov z28.d, z28.d\n"
+ "add z31.s, z31.s, z13.s\n"
+ "mov z13.s, #0x0\n"
+ "sdot z13.s, z11.b, z5.b[2]\n"
+ "add z18.s, z18.s, z14.s\n"
+ "mov z14.s, #0x0\n"
+ "sdot z14.s, z11.b, z5.b[3]\n"
+ "add z29.s, z29.s, z15.s\n"
+ "add z28.s, z28.s, z25.s\n"
+ "add z31.s, z31.s, z26.s\n"
+ "add z18.s, z18.s, z27.s\n"
+ "add z29.s, z29.s, z24.s\n"
+ "add z28.s, z28.s, z23.s\n"
+ "mov z26.d, z26.d\n"
+ "mov z25.d, z27.d\n"
+ "mov z24.d, z24.d\n"
+ "mov z23.d, z23.d\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z24.s, z24.s, z17.s\n"
+ "add z23.s, z23.s, z20.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z23.s, z23.s, z14.s\n"
+ "neg z30.s, p2/M, z30.s\n"
+ "mul z31.s, p2/M, z31.s, z30.s\n"
+ "st1w { z31.s }, p2, [SP]\n"
+ "add z31.s, z31.s, z1.s\n"
+ "mul z18.s, p2/M, z18.s, z30.s\n"
+ "st1w { z18.s }, p2, [SP, #1, MUL VL]\n"
+ "add z18.s, z18.s, z1.s\n"
+ "mul z29.s, p2/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+ "add z29.s, z29.s, z1.s\n"
+ "mul z28.s, p2/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p2, [SP, #3, MUL VL]\n"
+ "add z28.s, z28.s, z1.s\n"
+ "mul z26.s, p2/M, z26.s, z30.s\n"
+ "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+ "add z26.s, z26.s, z1.s\n"
+ "mul z25.s, p2/M, z25.s, z30.s\n"
+ "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "mul z24.s, p2/M, z24.s, z30.s\n"
+ "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+ "add z24.s, z24.s, z1.s\n"
+ "mul z23.s, p2/M, z23.s, z30.s\n"
+ "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+ "add z23.s, z23.s, z1.s\n"
+ "1:" // Loop
+ "sdot z31.s, z8.b, z7.b[0]\n"
+ "ld1w { z22.s }, p2/Z, [%x[params]]\n"
+ "incb x28\n"
+ "sdot z18.s, z8.b, z7.b[1]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "whilelt p0.s, x27, %x[n_channels]\n"
+ "sdot z29.s, z8.b, z7.b[2]\n"
+ "whilelt p1.b, x28, x9\n"
+ "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z28.s, z8.b, z7.b[3]\n"
+ "sdot z26.s, z8.b, z4.b[0]\n"
+ "sdot z25.s, z8.b, z4.b[1]\n"
+ "sdot z24.s, z8.b, z4.b[2]\n"
+ "sdot z23.s, z8.b, z4.b[3]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z31.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z9.b, z3.b[1]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z28.s, z9.b, z3.b[3]\n"
+ "sdot z26.s, z9.b, z6.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[1]\n"
+ "sdot z24.s, z9.b, z6.b[2]\n"
+ "sdot z23.s, z9.b, z6.b[3]\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z31.s, z10.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[1]\n"
+ "sdot z29.s, z10.b, z4.b[2]\n"
+ "sdot z28.s, z10.b, z4.b[3]\n"
+ "sdot z26.s, z10.b, z5.b[0]\n"
+ "sdot z25.s, z10.b, z5.b[1]\n"
+ "sdot z24.s, z10.b, z5.b[2]\n"
+ "sdot z23.s, z10.b, z5.b[3]\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ ".inst 0x04b677ff // sqrdmulh z31.s, z31.s, z22.s\n"
+ ".inst 0x04b67652 // sqrdmulh z18.s, z18.s, z22.s\n"
+ ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
+ ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
+ ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
+ "and z20.d, z31.d, z21.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z18.d, z21.d\n"
+ "and z14.d, z29.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z17.d, z28.d, z21.d\n"
+ "and z2.d, z26.d, z21.d\n"
+ "asr z14.s, z14.s, #0x1f\n"
+ ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
+ "sqadd z18.s, z18.s, z19.s\n"
+ "sqadd z29.s, z29.s, z14.s\n"
+ "and z27.d, z25.d, z21.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ "sqadd z26.s, z26.s, z2.s\n"
+ "and z17.d, z24.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "and z15.d, z23.d, z21.d\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z27.s\n"
+ ".inst 0x44828ab2 // srshl z18.s, p2/M, z18.s, z21.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z17.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "sqadd z23.s, z23.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z0.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "smin z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "smax z31.s, p2/M, z31.s, z16.s\n"
+ "st1b { z31.s }, p0, [x26, x27]\n"
+ "add z28.s, z28.s, z12.s\n"
+ "smax z18.s, p2/M, z18.s, z16.s\n"
+ "ld1w { z31.s }, p2/Z, [SP]\n"
+ "smin z29.s, p2/M, z29.s, z0.s\n"
+ "st1b { z18.s }, p0, [x25, x27]\n"
+ "add z31.s, z31.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n"
+ "smax z29.s, p2/M, z29.s, z16.s\n"
+ "st1b { z29.s }, p0, [x24, x27]\n"
+ "add z18.s, z18.s, z1.s\n"
+ "smax z28.s, p2/M, z28.s, z16.s\n"
+ "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "st1b { z28.s }, p0, [x23, x27]\n"
+ "add z29.s, z29.s, z1.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n"
+ "add z26.s, z26.s, z12.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z28.s, z28.s, z1.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "smin z26.s, p2/M, z26.s, z0.s\n"
+ "smin z25.s, p2/M, z25.s, z0.s\n"
+ "smin z24.s, p2/M, z24.s, z0.s\n"
+ "smin z23.s, p2/M, z23.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z16.s\n"
+ "st1b { z26.s }, p0, [x22, x27]\n"
+ "smax z25.s, p2/M, z25.s, z16.s\n"
+ "smax z24.s, p2/M, z24.s, z16.s\n"
+ "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+ "smax z23.s, p2/M, z23.s, z16.s\n"
+ "st1b { z25.s }, p0, [x21, x27]\n"
+ "add z26.s, z26.s, z1.s\n"
+ "st1b { z24.s }, p0, [x20, x27]\n"
+ "st1b { z23.s }, p0, [x19, x27]\n"
+ "incw x27\n"
+ "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+ "add z24.s, z24.s, z1.s\n"
+ "add z23.s, z23.s, z1.s\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
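The long prologue before label 1 in this kernel is the weight-offset correction: the ones-vector sdots (z11 holds byte lanes 1,1,1,0 per word) accumulate, for each of the eight outputs, the sum of the input points under its window; that sum is multiplied by the negated b_offset, spilled to the stack (SP, #0..#7, MUL VL) and re-seeded into the accumulators every iteration, so the steady-state loop only issues the weight sdots. In scalar form — assuming, as elsewhere in Requantize32, that quantized weights carry offset b_offset, and that the input-offset term is folded into the packed bias by packing code that is not part of this hunk:

    #include <cstdint>

    // One output lane, pre-requantization:
    //   bias + sum_i(x_i * w_i) - b_offset * sum_i(x_i)
    //     == bias + sum_i(x_i * (w_i - b_offset))
    inline int32_t accumulate(const int8_t *x, const int8_t *w, int n_taps,
                              int32_t bias, int32_t b_offset)
    {
      int32_t dot = 0, xsum = 0;
      for (int i = 0; i < n_taps; i++)
      {
        dot  += (int32_t) x[i] * w[i];  // the per-iteration sdot work
        xsum += x[i];                   // the hoisted ones-vector sdot work
      }
      return bias + dot - b_offset * xsum;
    }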
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..2c33bdcd3a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 8;
+ constexpr static unsigned int input_cols = 6;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+ sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
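vl_type = arm_gemm::VLType::SVE marks these kernels as vector-length agnostic: nothing in the geometry above depends on the register size, and the channel dimension is walked with whilelt predication (the whilelt / incw / b.any skeleton visible in every generic.cpp here). The same loop shape with ACLE SVE intrinsics instead of raw assembly, as a minimal sketch:

    #include <arm_sve.h>
    #include <cstdint>

    // Add the requantization c_offset to n_channels int32 accumulators,
    // whatever the hardware vector length happens to be.
    void add_c_offset(int32_t *acc, uint64_t n_channels, int32_t c_offset)
    {
      for (uint64_t c = 0; c < n_channels; c += svcntw())
      {
        const svbool_t pg = svwhilelt_b32(c, n_channels);  // tail lanes masked off
        svst1_s32(pg, acc + c, svadd_n_s32_x(pg, svld1_s32(pg, acc + c), c_offset));
      }
    }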
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ffa2c6a7bc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov z20.b, #0x1\n"
+ "ldr x24, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "mov z22.s, #0x1\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "lsl x9, %x[n_channels], #0x2\n"
+ "mov z30.s, #0x0\n"
+ "ldr x22, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "mov z28.s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "mov x20, #0x6\n"
+ "mov z29.s, #0x0\n"
+ "ldr x19, [%x[inptrs], #0x20]\n"
+ "whilelt p0.b, XZR, x20\n"
+ "mov z27.s, #0x0\n"
+ "ld1b { z0.b }, p0/Z, [x24]\n"
+ "mov x28, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "ld1b { z3.b }, p0/Z, [x23]\n"
+ "mov x27, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22]\n"
+ "whilelt p1.b, x28, x9\n"
+ "mov z15.d, z0.d\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "mov z24.s, #0x0\n"
+ "ld1b { z6.b }, p0/Z, [x19]\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x21, [%x[inptrs], #0x28]\n"
+ "mov z16.d, z3.d\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ldr x19, [%x[inptrs], #0x38]\n"
+ "mov z18.d, z5.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "zip1 z0.d, z0.d, z15.d\n"
+ "ld1b { z1.b }, p0/Z, [x20]\n"
+ "mov z0.q, z0.q[0]\n"
+ "ld1b { z2.b }, p0/Z, [x19]\n"
+ "zip1 z3.d, z3.d, z16.d\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z3.q, z3.q[0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "mov z16.d, z4.d\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "mov z17.d, z6.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z5.q, z5.q[0]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "zip1 z4.d, z4.d, z16.d\n"
+ "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "mov z16.d, z7.d\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "zip1 z6.d, z6.d, z17.d\n"
+ "mov z17.d, z1.d\n"
+ "mov z6.q, z6.q[0]\n"
+ "zip1 z7.d, z7.d, z16.d\n"
+ "mov z7.q, z7.q[0]\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z16.d, z2.d\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "mov z23.s, #0x0\n"
+ "zip1 z1.d, z1.d, z17.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "zip1 z2.d, z2.d, z16.d\n"
+ "mov z2.q, z2.q[0]\n"
+ "mov z18.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "sdot z30.s, z20.b, z0.b[0]\n"
+ "sdot z28.s, z20.b, z0.b[2]\n"
+ "sdot z29.s, z20.b, z3.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[2]\n"
+ "sdot z30.s, z22.b, z0.b[1]\n"
+ "sdot z28.s, z22.b, z0.b[3]\n"
+ "sdot z29.s, z22.b, z3.b[1]\n"
+ "sdot z27.s, z22.b, z3.b[3]\n"
+ "sdot z26.s, z20.b, z5.b[0]\n"
+ "sdot z25.s, z20.b, z5.b[2]\n"
+ "sdot z24.s, z20.b, z4.b[0]\n"
+ "sdot z23.s, z20.b, z4.b[2]\n"
+ "sdot z26.s, z22.b, z5.b[1]\n"
+ "sdot z25.s, z22.b, z5.b[3]\n"
+ "sdot z24.s, z22.b, z4.b[1]\n"
+ "sdot z23.s, z22.b, z4.b[3]\n"
+ "sdot z18.s, z20.b, z6.b[0]\n"
+ "sdot z17.s, z20.b, z6.b[2]\n"
+ "sdot z16.s, z20.b, z7.b[0]\n"
+ "sdot z21.s, z20.b, z7.b[2]\n"
+ "sdot z18.s, z22.b, z6.b[1]\n"
+ "sdot z17.s, z22.b, z6.b[3]\n"
+ "sdot z16.s, z22.b, z7.b[1]\n"
+ "sdot z21.s, z22.b, z7.b[3]\n"
+ "sdot z19.s, z20.b, z1.b[0]\n"
+ "mov z30.d, z30.d\n"
+ "mov z28.d, z28.d\n"
+ "add z30.s, z30.s, z29.s\n"
+ "sdot z19.s, z22.b, z1.b[1]\n"
+ "add z28.s, z28.s, z27.s\n"
+ "add z30.s, z30.s, z26.s\n"
+ "mov z29.d, z29.d\n"
+ "add z28.s, z28.s, z25.s\n"
+ "add z30.s, z30.s, z24.s\n"
+ "mov z27.d, z27.d\n"
+ "add z28.s, z28.s, z23.s\n"
+ "add z30.s, z30.s, z18.s\n"
+ "add z29.s, z29.s, z26.s\n"
+ "add z28.s, z28.s, z17.s\n"
+ "add z27.s, z27.s, z25.s\n"
+ "add z29.s, z29.s, z24.s\n"
+ "mov z26.d, z26.d\n"
+ "add z27.s, z27.s, z23.s\n"
+ "add z29.s, z29.s, z18.s\n"
+ "mov z25.d, z25.d\n"
+ "add z27.s, z27.s, z17.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z26.s, z26.s, z24.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z18.s\n"
+ "mov z24.d, z24.d\n"
+ "add z25.s, z25.s, z17.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "mov z23.d, z23.d\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z26.s, z26.s, z19.s\n"
+ "add z24.s, z24.s, z18.s\n"
+ "mov z18.s, #0x0\n"
+ "sdot z18.s, z20.b, z1.b[2]\n"
+ "add z23.s, z23.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z20.b, z2.b[0]\n"
+ "sdot z18.s, z22.b, z1.b[3]\n"
+ "add z24.s, z24.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "sdot z17.s, z22.b, z2.b[1]\n"
+ "sdot z16.s, z20.b, z2.b[2]\n"
+ "add z25.s, z25.s, z18.s\n"
+ "add z23.s, z23.s, z21.s\n"
+ "add z24.s, z24.s, z19.s\n"
+ "sdot z16.s, z22.b, z2.b[3]\n"
+ "add z23.s, z23.s, z18.s\n"
+ "add z24.s, z24.s, z17.s\n"
+ "neg z15.s, p2/M, z15.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z15.s\n"
+ "st1w { z30.s }, p2, [SP]\n"
+ "add z30.s, z30.s, z13.s\n"
+ "mul z28.s, p2/M, z28.s, z15.s\n"
+ "st1w { z28.s }, p2, [SP, #1, MUL VL]\n"
+ "add z28.s, z28.s, z13.s\n"
+ "mul z29.s, p2/M, z29.s, z15.s\n"
+ "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+ "add z29.s, z29.s, z13.s\n"
+ "mul z27.s, p2/M, z27.s, z15.s\n"
+ "st1w { z27.s }, p2, [SP, #3, MUL VL]\n"
+ "add z27.s, z27.s, z13.s\n"
+ "mul z26.s, p2/M, z26.s, z15.s\n"
+ "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+ "add z26.s, z26.s, z13.s\n"
+ "mul z25.s, p2/M, z25.s, z15.s\n"
+ "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z13.s\n"
+ "mul z24.s, p2/M, z24.s, z15.s\n"
+ "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "mul z23.s, p2/M, z23.s, z15.s\n"
+ "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+ "add z23.s, z23.s, z13.s\n"
+ "1:" // Loop
+ "sdot z30.s, z8.b, z0.b[0]\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "incb x28\n"
+ "sdot z28.s, z8.b, z0.b[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "whilelt p0.s, x27, %x[n_channels]\n"
+ "sdot z29.s, z8.b, z3.b[0]\n"
+ "whilelt p1.b, x28, x9\n"
+ "sdot z27.s, z8.b, z3.b[2]\n"
+ "sdot z26.s, z8.b, z5.b[0]\n"
+ "sdot z25.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z4.b[0]\n"
+ "sdot z23.s, z8.b, z4.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+ "sdot z30.s, z9.b, z0.b[1]\n"
+ "sdot z28.s, z9.b, z0.b[3]\n"
+ "sdot z29.s, z9.b, z3.b[1]\n"
+ "sdot z27.s, z9.b, z3.b[3]\n"
+ "sdot z26.s, z9.b, z5.b[1]\n"
+ "sdot z25.s, z9.b, z5.b[3]\n"
+ "sdot z24.s, z9.b, z4.b[1]\n"
+ "sdot z23.s, z9.b, z4.b[3]\n"
+ "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z28.s, z10.b, z3.b[2]\n"
+ "sdot z29.s, z10.b, z5.b[0]\n"
+ "sdot z27.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z4.b[0]\n"
+ "sdot z25.s, z10.b, z4.b[2]\n"
+ "sdot z24.s, z10.b, z6.b[0]\n"
+ "sdot z23.s, z10.b, z6.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z30.s, z11.b, z3.b[1]\n"
+ "sdot z28.s, z11.b, z3.b[3]\n"
+ "sdot z29.s, z11.b, z5.b[1]\n"
+ "sdot z27.s, z11.b, z5.b[3]\n"
+ "sdot z26.s, z11.b, z4.b[1]\n"
+ "sdot z25.s, z11.b, z4.b[3]\n"
+ "sdot z24.s, z11.b, z6.b[1]\n"
+ "sdot z23.s, z11.b, z6.b[3]\n"
+ "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z30.s, z8.b, z5.b[0]\n"
+ "sdot z28.s, z8.b, z5.b[2]\n"
+ "sdot z29.s, z8.b, z4.b[0]\n"
+ "sdot z27.s, z8.b, z4.b[2]\n"
+ "sdot z26.s, z8.b, z6.b[0]\n"
+ "sdot z25.s, z8.b, z6.b[2]\n"
+ "sdot z24.s, z8.b, z7.b[0]\n"
+ "sdot z23.s, z8.b, z7.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z30.s, z9.b, z5.b[1]\n"
+ "sdot z28.s, z9.b, z5.b[3]\n"
+ "sdot z29.s, z9.b, z4.b[1]\n"
+ "sdot z27.s, z9.b, z4.b[3]\n"
+ "sdot z26.s, z9.b, z6.b[1]\n"
+ "sdot z25.s, z9.b, z6.b[3]\n"
+ "sdot z24.s, z9.b, z7.b[1]\n"
+ "sdot z23.s, z9.b, z7.b[3]\n"
+ "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z30.s, z10.b, z4.b[0]\n"
+ "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z28.s, z10.b, z4.b[2]\n"
+ "sdot z29.s, z10.b, z6.b[0]\n"
+ "sdot z27.s, z10.b, z6.b[2]\n"
+ "sdot z26.s, z10.b, z7.b[0]\n"
+ "sdot z25.s, z10.b, z7.b[2]\n"
+ "sdot z24.s, z10.b, z1.b[0]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z30.s, z11.b, z4.b[1]\n"
+ "sdot z28.s, z11.b, z4.b[3]\n"
+ "sdot z29.s, z11.b, z6.b[1]\n"
+ "sdot z27.s, z11.b, z6.b[3]\n"
+ "sdot z26.s, z11.b, z7.b[1]\n"
+ "sdot z25.s, z11.b, z7.b[3]\n"
+ "sdot z24.s, z11.b, z1.b[1]\n"
+ "sdot z23.s, z11.b, z1.b[3]\n"
+ "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "sdot z30.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z6.b[2]\n"
+ "sdot z29.s, z8.b, z7.b[0]\n"
+ "sdot z27.s, z8.b, z7.b[2]\n"
+ "sdot z26.s, z8.b, z1.b[0]\n"
+ "sdot z25.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z23.s, z8.b, z2.b[2]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "sdot z30.s, z9.b, z6.b[1]\n"
+ "sdot z28.s, z9.b, z6.b[3]\n"
+ "sdot z29.s, z9.b, z7.b[1]\n"
+ "sdot z27.s, z9.b, z7.b[3]\n"
+ "sdot z26.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z1.b[3]\n"
+ "sdot z24.s, z9.b, z2.b[1]\n"
+ "sdot z23.s, z9.b, z2.b[3]\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "addvl %x[params], %x[params], #-3\n"
+ ".inst 0x04b677de // sqrdmulh z30.s, z30.s, z22.s\n"
+ ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
+ ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
+ ".inst 0x04b6777b // sqrdmulh z27.s, z27.s, z22.s\n"
+ ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
+ "and z20.d, z30.d, z21.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z17.d, z27.d, z21.d\n"
+ "and z16.d, z26.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z30.s, z30.s, z20.s\n"
+ ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ "and z18.d, z25.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z17.s\n"
+ "sqadd z26.s, z26.s, z16.s\n"
+ "and z17.d, z24.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "and z16.d, z23.d, z21.d\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "sqadd z24.s, z24.s, z17.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "smax z30.s, p2/M, z30.s, z31.s\n"
+ "st1b { z30.s }, p0, [x26, x27]\n"
+ "add z27.s, z27.s, z14.s\n"
+ "smax z28.s, p2/M, z28.s, z31.s\n"
+ "ld1w { z30.s }, p2/Z, [SP]\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "st1b { z28.s }, p0, [x25, x27]\n"
+ "add z30.s, z30.s, z13.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n"
+ "smax z29.s, p2/M, z29.s, z31.s\n"
+ "st1b { z29.s }, p0, [x24, x27]\n"
+ "add z28.s, z28.s, z13.s\n"
+ "smax z27.s, p2/M, z27.s, z31.s\n"
+ "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "st1b { z27.s }, p0, [x23, x27]\n"
+ "add z29.s, z29.s, z13.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n"
+ "add z26.s, z26.s, z14.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z23.s, z23.s, z14.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z23.s, p2/M, z23.s, z12.s\n"
+ "smax z26.s, p2/M, z26.s, z31.s\n"
+ "st1b { z26.s }, p0, [x22, x27]\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "smax z24.s, p2/M, z24.s, z31.s\n"
+ "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+ "smax z23.s, p2/M, z23.s, z31.s\n"
+ "st1b { z25.s }, p0, [x21, x27]\n"
+ "add z26.s, z26.s, z13.s\n"
+ "st1b { z24.s }, p0, [x20, x27]\n"
+ "st1b { z23.s }, p0, [x19, x27]\n"
+ "incw x27\n"
+ "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z13.s\n"
+ "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z23.s, z23.s, z13.s\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
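
Editor's note on the kernel above: its tail (the sqrdmulh / and / asr / sqadd / srshl / add / smin / smax run) is the standard fixed-point requantisation sequence. Below is a minimal scalar sketch of one lane, assuming the usual arm_gemm::Requantize32 semantics; the function and variable names are ours, not the library's, and the single saturating corner case of sqrdmulh (INT32_MIN * INT32_MIN) is omitted.

#include <algorithm>
#include <cstdint>

// Scalar model of one lane of the requantisation tail. The vector code
// computes a sign-dependent correction term (and/asr/sqadd) before srshl;
// here that is folded into a single rounding arithmetic shift.
inline int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    const int64_t prod = (int64_t)acc * (int64_t)mul;
    // sqrdmulh: rounded, doubling, high-half multiply == round(prod / 2^31).
    int32_t v = (int32_t)((prod + (1LL << 30)) >> 31);
    // srshl by a negative amount: rounding arithmetic shift right.
    if (shift < 0)
    {
        const int32_t s = -shift;
        v = (v + (1 << (s - 1))) >> s;
    }
    v += c_offset;
    return (int8_t)std::max(minval, std::min(maxval, v));
}
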
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4098f6f660
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef int8_t input_type;
+ typedef int8_t weight_type;
+ typedef int8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
+
+ kern_type kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+ sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
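
The tile geometry declared in this struct is self-consistent: a stride-1 3x3 kernel producing a 2x2 output tile must read a 4x4 input patch. A compile-time restatement of that relation, purely as an illustration (our check, not library code):

// Receptive-field relation for the constants above:
// input_rows = (output_rows - 1) * stride_rows + kernel_rows.
static_assert((2 - 1) * 1 + 3 == 4,
              "a 2x2 output tile of a 3x3 s1 kernel needs a 4-row input patch");
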
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3345449fe1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+ __asm__ __volatile__(
+ "ldp x11, x10, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "ldp x27, x26, [%x[inptrs], #0x20]\n"
+ "mov x25, #0x0\n"
+ "ldp x24, x23, [%x[inptrs], #0x30]\n"
+ "whilelt p1.b, x25, %x[n_channels]\n"
+ "ldp x22, x21, [%x[outptrs], #0x0]\n"
+ "ldp x20, x19, [%x[outptrs], #0x10]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "1:" // Loop
+ "ld1b { z19.b }, p1/Z, [x11, x25]\n"
+ "whilelt p0.s, x25, %x[n_channels]\n"
+ "ld1b { z18.b }, p1/Z, [x10, x25]\n"
+ "ldp x11, x10, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p1/Z, [x9, x25]\n"
+ "zip1 z21.b, z19.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, x25]\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x50]\n"
+ "ld1b { z23.b }, p1/Z, [x27, x25]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "ld1b { z20.b }, p1/Z, [x26, x25]\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x60]\n"
+ "zip1 z3.b, z21.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, x25]\n"
+ "zip2 z2.b, z21.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x23, x25]\n"
+ "zip1 z29.b, z19.b, z18.b\n"
+ "ldp x24, x23, [%x[inptrs], #0x70]\n"
+ "zip2 z28.b, z19.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x11, x25]\n"
+ "zip1 z19.b, z23.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x10, x25]\n"
+ "zip2 z27.b, z23.b, z17.b\n"
+ "ldp x11, x10, [%x[inptrs], #0x0]\n"
+ "zip1 z18.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x9, x25]\n"
+ "zip2 z20.b, z20.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x28, x25]\n"
+ "zip1 z1.b, z19.b, z18.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x10]\n"
+ "zip2 z0.b, z19.b, z18.b\n"
+ "ld1b { z19.b }, p1/Z, [x27, x25]\n"
+ "zip1 z26.b, z22.b, z17.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x25]\n"
+ "zip2 z24.b, z22.b, z17.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x20]\n"
+ "zip1 z23.b, z21.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x24, x25]\n"
+ "zip2 z22.b, z21.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x23, x25]\n"
+ "zip1 z17.b, z27.b, z20.b\n"
+ "ldp x24, x23, [%x[inptrs], #0x30]\n"
+ "zip2 z16.b, z27.b, z20.b\n"
+ "st1b { z29.b }, p2, [SP]\n"
+ "zip1 z20.b, z19.b, z18.b\n"
+ "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
+ "zip1 z18.b, z25.b, z21.b\n"
+ "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
+ "zip2 z17.b, z25.b, z21.b\n"
+ "ld1w { z31.s }, p2/Z, [%x[params]]\n"
+ "zip1 z30.b, z26.b, z23.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z28.b, z26.b, z23.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z16.b, z24.b, z22.b\n"
+ "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
+ "zip2 z16.b, z24.b, z22.b\n"
+ "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
+ "zip1 z26.b, z20.b, z18.b\n"
+ "ld1b { z25.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip2 z24.b, z20.b, z18.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
+ "mov z22.d, z31.d\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "mov z20.d, z31.d\n"
+ "mov z19.d, z31.d\n"
+ "sdot z31.s, z29.b, z3.b\n"
+ "sdot z20.s, z29.b, z1.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "sdot z31.s, z27.b, z1.b\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z20.s, z27.b, z30.b\n"
+ "sdot z22.s, z29.b, z3.b\n"
+ "ld1b { z3.b }, p2/Z, [SP]\n"
+ "sdot z31.s, z25.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z20.s, z25.b, z26.b\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "sdot z19.s, z29.b, z1.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z22.s, z27.b, z1.b\n"
+ "ld1b { z1.b }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ "sdot z19.s, z27.b, z30.b\n"
+ "sdot z22.s, z25.b, z30.b\n"
+ "ld1b { z30.b }, p2/Z, [SP, #4, MUL VL]\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sdot z19.s, z25.b, z26.b\n"
+ "ld1b { z26.b }, p2/Z, [SP, #6, MUL VL]\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ "and z18.d, z20.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ "and z17.d, z22.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "and z16.d, z19.d, z21.d\n"
+ "sqadd z20.s, z20.s, z18.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "smax z31.s, p2/M, z31.s, z6.s\n"
+ "smax z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z5.s\n"
+ "st1b { z31.s }, p0, [x22, x25]\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z22.s, p2/M, z22.s, z6.s\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "smin z20.s, p2/M, z20.s, z5.s\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z25.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smax z19.s, p2/M, z19.s, z6.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z5.s\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "smin z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z20.s }, p0, [x20, x25]\n"
+ "mov z20.d, z31.d\n"
+ "st1b { z22.s }, p0, [x21, x25]\n"
+ "mov z22.d, z31.d\n"
+ "st1b { z19.s }, p0, [x19, x25]\n"
+ "mov z19.d, z31.d\n"
+ "incw x25\n"
+ "sdot z31.s, z29.b, z2.b\n"
+ "whilelt p0.s, x25, %x[n_channels]\n"
+ "sdot z20.s, z29.b, z0.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z31.s, z27.b, z0.b\n"
+ "sdot z20.s, z27.b, z28.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z22.s, z29.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [SP, #1, MUL VL]\n"
+ "sdot z31.s, z25.b, z28.b\n"
+ "sdot z20.s, z25.b, z24.b\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "sdot z19.s, z29.b, z0.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z22.s, z27.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [SP, #3, MUL VL]\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ "sdot z19.s, z27.b, z28.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "sdot z22.s, z25.b, z28.b\n"
+ "ld1b { z28.b }, p2/Z, [SP, #5, MUL VL]\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sdot z19.s, z25.b, z24.b\n"
+ "ld1b { z25.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ "ld1b { z24.b }, p2/Z, [SP, #7, MUL VL]\n"
+ "and z18.d, z20.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ "and z17.d, z22.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "and z16.d, z19.d, z21.d\n"
+ "sqadd z20.s, z20.s, z18.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "smax z31.s, p2/M, z31.s, z6.s\n"
+ "smax z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "add z22.s, z22.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z5.s\n"
+ "st1b { z31.s }, p0, [x22, x25]\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z22.s, p2/M, z22.s, z6.s\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "smin z20.s, p2/M, z20.s, z5.s\n"
+ "st1b { z20.s }, p0, [x20, x25]\n"
+ "mov z20.d, z31.d\n"
+ "smin z22.s, p2/M, z22.s, z5.s\n"
+ "st1b { z22.s }, p0, [x21, x25]\n"
+ "mov z22.d, z31.d\n"
+ "sdot z20.s, z29.b, z1.b\n"
+ "smax z19.s, p2/M, z19.s, z6.s\n"
+ "sdot z20.s, z27.b, z30.b\n"
+ "smin z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z19.s }, p0, [x19, x25]\n"
+ "mov z19.d, z31.d\n"
+ "incw x25\n"
+ "sdot z31.s, z29.b, z3.b\n"
+ "whilelt p0.s, x25, %x[n_channels]\n"
+ "sdot z20.s, z25.b, z26.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "sdot z31.s, z27.b, z1.b\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z22.s, z29.b, z3.b\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ "sdot z31.s, z25.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z19.s, z29.b, z1.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z22.s, z27.b, z1.b\n"
+ "and z18.d, z20.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sdot z19.s, z27.b, z30.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z22.s, z25.b, z30.b\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "sdot z19.s, z25.b, z26.b\n"
+ "ld1b { z25.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ "sqadd z20.s, z20.s, z18.s\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z17.d, z22.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ "and z16.d, z19.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "smax z20.s, p2/M, z20.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z6.s\n"
+ ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "add z22.s, z22.s, z4.s\n"
+ "smin z20.s, p2/M, z20.s, z5.s\n"
+ "st1b { z20.s }, p0, [x20, x25]\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z5.s\n"
+ "st1b { z31.s }, p0, [x22, x25]\n"
+ "smax z22.s, p2/M, z22.s, z6.s\n"
+ "smax z19.s, p2/M, z19.s, z6.s\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "mov z20.d, z31.d\n"
+ "smin z22.s, p2/M, z22.s, z5.s\n"
+ "st1b { z22.s }, p0, [x21, x25]\n"
+ "mov z22.d, z31.d\n"
+ "sdot z20.s, z29.b, z0.b\n"
+ "smin z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z19.s }, p0, [x19, x25]\n"
+ "mov z19.d, z31.d\n"
+ "incw x25\n"
+ "sdot z31.s, z29.b, z2.b\n"
+ "whilelt p0.s, x25, %x[n_channels]\n"
+ "sdot z20.s, z27.b, z28.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z31.s, z27.b, z0.b\n"
+ "sdot z20.s, z25.b, z24.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "sdot z22.s, z29.b, z2.b\n"
+ "sdot z31.s, z25.b, z28.b\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "sdot z19.s, z29.b, z0.b\n"
+ "sdot z22.s, z27.b, z0.b\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ "sdot z19.s, z27.b, z28.b\n"
+ "sdot z22.s, z25.b, z28.b\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sdot z19.s, z25.b, z24.b\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ "and z18.d, z20.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z17.d, z22.d, z21.d\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ "and z16.d, z19.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n"
+ "sqadd z19.s, z19.s, z16.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "smax z31.s, p2/M, z31.s, z6.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z5.s\n"
+ "st1b { z31.s }, p0, [x22, x25]\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z22.s, p2/M, z22.s, z6.s\n"
+ "smax z20.s, p2/M, z20.s, z6.s\n"
+ "smax z19.s, p2/M, z19.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z5.s\n"
+ "st1b { z22.s }, p0, [x21, x25]\n"
+ "smin z20.s, p2/M, z20.s, z5.s\n"
+ "smin z19.s, p2/M, z19.s, z5.s\n"
+ "st1b { z20.s }, p0, [x20, x25]\n"
+ "st1b { z19.s }, p0, [x19, x25]\n"
+ "incw x25\n"
+ "whilelt p1.b, x25, %x[n_channels]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
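
The zip1/zip2 cascade at the head of the loop above byte-transposes four input rows so that each 32-bit sdot lane consumes one byte from each row. A plain C++ model of that staging, with our own naming (the real code keeps everything in Z registers and spills the second half of the interleaved data to the stack via SP):

#include <cstddef>
#include <cstdint>

// Model of zip1(zip1(r0, r2), zip1(r1, r3)): four rows interleaved so that
// each consecutive group of four bytes feeds one sdot accumulation.
void interleave_rows(const int8_t *r0, const int8_t *r1,
                     const int8_t *r2, const int8_t *r3,
                     int8_t *out, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        out[4 * i + 0] = r0[i];
        out[4 * i + 1] = r1[i];
        out[4 * i + 2] = r2[i];
        out[4 * i + 3] = r3[i];
    }
}
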
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..72b26a50a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_dot::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_dot::get_packed_size;
+
+ kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+ sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
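
As with the s8qs variant above, this header only describes the kernel. A hedged sketch of how the surrounding depthwise machinery might consume such a descriptor follows; only the member names come from the struct above, the driver itself is illustrative and assumes the same headers the kernel sources include.

// Illustrative driver: instantiate the strategy for the current CPU, then
// invoke its kernel function pointer on one tile of pointers.
template <typename Strategy>
void run_one_tile(const typename Strategy::input_type *const *inptrs,
                  typename Strategy::return_type *const *outptrs,
                  const void *packed_params, uint64_t n_channels,
                  const arm_gemm::Requantize32 &qp, const CPUInfo *ci)
{
    Strategy strategy(ci);
    strategy.kernel(inptrs, outptrs, packed_params, n_channels, qp);
}
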
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ca6af57171
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+ __asm__ __volatile__(
+ "ldp x11, x10, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "ldp x27, x26, [%x[inptrs], #0x20]\n"
+ "mov x19, #0x1\n"
+ "ldp x25, x24, [%x[inptrs], #0x30]\n"
+ "orr x19, x19, #0x100\n"
+ "ldp x23, x22, [%x[outptrs], #0x0]\n"
+ "orr x19, x19, #0x10000\n"
+ "dup z12.s, w19\n"
+ "ldp x21, x20, [%x[outptrs], #0x10]\n"
+ "mov x19, #0x0\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "whilelt p1.b, x19, %x[n_channels]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "1:" // Loop
+ "mov z7.s, #0x0\n"
+ "ld1b { z19.b }, p1/Z, [x11, x19]\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "mov z6.s, #0x0\n"
+ "ld1b { z18.b }, p1/Z, [x10, x19]\n"
+ "ldp x11, x10, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p1/Z, [x9, x19]\n"
+ "zip1 z21.b, z19.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x28, x19]\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x50]\n"
+ "ld1b { z23.b }, p1/Z, [x27, x19]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "ld1b { z20.b }, p1/Z, [x26, x19]\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x60]\n"
+ "zip1 z5.b, z21.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x25, x19]\n"
+ "zip2 z4.b, z21.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x24, x19]\n"
+ "zip1 z29.b, z19.b, z18.b\n"
+ "ldp x25, x24, [%x[inptrs], #0x70]\n"
+ "zip2 z28.b, z19.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x11, x19]\n"
+ "zip1 z19.b, z23.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x10, x19]\n"
+ "zip2 z27.b, z23.b, z17.b\n"
+ "ldp x11, x10, [%x[inptrs], #0x0]\n"
+ "zip1 z18.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x9, x19]\n"
+ "zip2 z20.b, z20.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x28, x19]\n"
+ "zip1 z3.b, z19.b, z18.b\n"
+ "ldp x9, x28, [%x[inptrs], #0x10]\n"
+ "zip2 z2.b, z19.b, z18.b\n"
+ "ld1b { z19.b }, p1/Z, [x27, x19]\n"
+ "zip1 z26.b, z22.b, z17.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x19]\n"
+ "zip2 z24.b, z22.b, z17.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x20]\n"
+ "zip1 z23.b, z21.b, z16.b\n"
+ "ld1b { z18.b }, p1/Z, [x25, x19]\n"
+ "zip2 z22.b, z21.b, z16.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, x19]\n"
+ "zip1 z17.b, z27.b, z20.b\n"
+ "ldp x25, x24, [%x[inptrs], #0x30]\n"
+ "zip2 z16.b, z27.b, z20.b\n"
+ "st1b { z29.b }, p2, [SP]\n"
+ "zip1 z20.b, z19.b, z18.b\n"
+ "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
+ "zip1 z18.b, z25.b, z21.b\n"
+ "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
+ "zip2 z17.b, z25.b, z21.b\n"
+ "ld1w { z1.s }, p2/Z, [%x[params]]\n"
+ "zip1 z0.b, z26.b, z23.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z30.b, z26.b, z23.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z16.b, z24.b, z22.b\n"
+ "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
+ "zip2 z16.b, z24.b, z22.b\n"
+ "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
+ "zip1 z28.b, z20.b, z18.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip2 z26.b, z20.b, z18.b\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
+ "mov z24.d, z1.d\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "mov z22.d, z1.d\n"
+ "mov z21.d, z1.d\n"
+ "udot z1.s, z31.b, z5.b\n"
+ "udot z22.s, z31.b, z3.b\n"
+ "udot z7.s, z12.b, z3.b\n"
+ "udot z1.s, z29.b, z3.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "udot z22.s, z29.b, z0.b\n"
+ "udot z7.s, z12.b, z0.b\n"
+ "udot z1.s, z27.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "udot z22.s, z27.b, z28.b\n"
+ "mov z20.d, z7.d\n"
+ "udot z7.s, z12.b, z5.b\n"
+ "udot z20.s, z12.b, z28.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "udot z21.s, z31.b, z3.b\n"
+ "udot z6.s, z12.b, z3.b\n"
+ "udot z24.s, z31.b, z5.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "udot z21.s, z29.b, z0.b\n"
+ "udot z6.s, z12.b, z0.b\n"
+ "udot z24.s, z29.b, z3.b\n"
+ "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "udot z21.s, z27.b, z28.b\n"
+ "mov z19.d, z6.d\n"
+ "udot z24.s, z27.b, z0.b\n"
+ "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n"
+ "udot z6.s, z12.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [SP]\n"
+ "udot z19.s, z12.b, z28.b\n"
+ "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "mov z7.s, #0x0\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "udot z7.s, z12.b, z2.b\n"
+ "mov z6.s, #0x0\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ "udot z7.s, z12.b, z30.b\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z17.d, z22.d, z23.d\n"
+ "mov z20.d, z7.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "udot z7.s, z12.b, z4.b\n"
+ "udot z20.s, z12.b, z26.b\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "and z16.d, z21.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "add z24.s, z24.s, z8.s\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z22.s, z22.s, z8.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "mov z24.d, z1.d\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "add z21.s, z21.s, z8.s\n"
+ "mov z22.d, z1.d\n"
+ "udot z22.s, z31.b, z2.b\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "udot z22.s, z29.b, z30.b\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "mov z21.d, z1.d\n"
+ "incw x19\n"
+ "udot z1.s, z31.b, z4.b\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "udot z22.s, z27.b, z26.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "udot z1.s, z29.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "udot z24.s, z31.b, z4.b\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ "udot z1.s, z27.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "udot z21.s, z31.b, z2.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "udot z24.s, z29.b, z2.b\n"
+ "udot z6.s, z12.b, z2.b\n"
+ "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ "udot z21.s, z29.b, z30.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "udot z24.s, z27.b, z30.b\n"
+ "udot z6.s, z12.b, z30.b\n"
+ "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n"
+ "and z17.d, z22.d, z23.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "udot z21.s, z27.b, z26.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "mov z19.d, z6.d\n"
+ "udot z6.s, z12.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n"
+ "udot z19.s, z12.b, z26.b\n"
+ "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "mov z7.s, #0x0\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "udot z7.s, z12.b, z3.b\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "mov z6.s, #0x0\n"
+ "udot z7.s, z12.b, z0.b\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "mov z20.d, z7.d\n"
+ "udot z7.s, z12.b, z5.b\n"
+ "udot z20.s, z12.b, z28.b\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ "and z16.d, z21.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "add z24.s, z24.s, z8.s\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z21.s, z21.s, z8.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "mov z24.d, z1.d\n"
+ "mov z22.d, z1.d\n"
+ "udot z22.s, z31.b, z3.b\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "mov z21.d, z1.d\n"
+ "incw x19\n"
+ "udot z1.s, z31.b, z5.b\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "udot z22.s, z29.b, z0.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "udot z1.s, z29.b, z3.b\n"
+ "udot z22.s, z27.b, z28.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "udot z24.s, z31.b, z5.b\n"
+ "udot z1.s, z27.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "udot z21.s, z31.b, z3.b\n"
+ "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z29.b, z3.b\n"
+ "udot z6.s, z12.b, z3.b\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "udot z21.s, z29.b, z0.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z27.b, z0.b\n"
+ "udot z6.s, z12.b, z0.b\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "udot z21.s, z27.b, z28.b\n"
+ "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "mov z7.s, #0x0\n"
+ "mov z19.d, z6.d\n"
+ "udot z6.s, z12.b, z5.b\n"
+ "udot z19.s, z12.b, z28.b\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "udot z7.s, z12.b, z2.b\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "mov z6.s, #0x0\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z17.d, z22.d, z23.d\n"
+ "and z16.d, z21.d, z23.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "udot z7.s, z12.b, z30.b\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "mov z20.d, z7.d\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "udot z7.s, z12.b, z4.b\n"
+ "udot z20.s, z12.b, z26.b\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z24.s, z24.s, z8.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "add z21.s, z21.s, z8.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "mov z24.d, z1.d\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "mov z22.d, z1.d\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "mov z21.d, z1.d\n"
+ "incw x19\n"
+ "udot z1.s, z31.b, z4.b\n"
+ "whilelt p0.s, x19, %x[n_channels]\n"
+ "udot z22.s, z31.b, z2.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "udot z1.s, z29.b, z2.b\n"
+ "udot z22.s, z29.b, z30.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "udot z24.s, z31.b, z4.b\n"
+ "udot z1.s, z27.b, z30.b\n"
+ "udot z22.s, z27.b, z26.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "udot z21.s, z31.b, z2.b\n"
+ "udot z24.s, z29.b, z2.b\n"
+ "udot z6.s, z12.b, z2.b\n"
+ "mls z1.s, p2/M, z7.s, z9.s\n"
+ "udot z21.s, z29.b, z30.b\n"
+ "udot z24.s, z27.b, z30.b\n"
+ "udot z6.s, z12.b, z30.b\n"
+ ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n"
+ "udot z21.s, z27.b, z26.b\n"
+ "mls z22.s, p2/M, z20.s, z9.s\n"
+ "mov z19.d, z6.d\n"
+ "udot z6.s, z12.b, z4.b\n"
+ "udot z19.s, z12.b, z26.b\n"
+ "and z16.d, z1.d, z23.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n"
+ "mls z24.s, p2/M, z6.s, z9.s\n"
+ "mls z21.s, p2/M, z19.s, z9.s\n"
+ ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n"
+ "and z17.d, z22.d, z23.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n"
+ "and z18.d, z24.d, z23.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z16.d, z21.d, z23.d\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ "add z1.s, z1.s, z8.s\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "smax z1.s, p2/M, z1.s, z11.s\n"
+ ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "smin z1.s, p2/M, z1.s, z10.s\n"
+ "st1b { z1.s }, p0, [x23, x19]\n"
+ "add z24.s, z24.s, z8.s\n"
+ "smax z22.s, p2/M, z22.s, z11.s\n"
+ ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "smin z22.s, p2/M, z22.s, z10.s\n"
+ "st1b { z22.s }, p0, [x21, x19]\n"
+ "add z21.s, z21.s, z8.s\n"
+ "smin z24.s, p2/M, z24.s, z10.s\n"
+ "st1b { z24.s }, p0, [x22, x19]\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smin z21.s, p2/M, z21.s, z10.s\n"
+ "st1b { z21.s }, p0, [x20, x19]\n"
+ "incw x19\n"
+ "whilelt p1.b, x19, %x[n_channels]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
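
Unlike the s8qs kernel, the u8q kernel above must correct for the weight zero point: the udot accumulations against z12 (which holds 0x00010101, so only the three valid bytes of each dot group are summed) build per-lane input sums, which mls then scales by b_offset and subtracts. A scalar restatement, with our own names:

#include <cstdint>

// acc starts as sum(w[k] * x[k]); subtracting b_offset * sum(x[k]) yields
// sum((w[k] - b_offset) * x[k]), i.e. the zero-point-corrected product.
int32_t correct_for_weight_offset(int32_t acc, const uint8_t x[3], int32_t b_offset)
{
    const int32_t input_sum = x[0] + x[1] + x[2];  // udot with 0x00010101
    return acc - b_offset * input_sum;             // mls
}
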
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6174dd0e9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+ sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
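
The mla variant whose implementation follows takes the other standard route to zero-point handling: instead of dot products plus an mls fix-up, usublb widens each byte to 16 bits while subtracting a_offset (for inputs) or b_offset (for weights), so the smlalb/smlalt accumulations already see offset-free values. A one-line model of that instruction, ours for illustration:

#include <cstdint>

// usublb (unsigned subtract long, bottom): zero-extend both bytes,
// subtract, and widen the result to 16 bits.
inline int16_t usublb_model(uint8_t value, uint8_t offset)
{
    return (int16_t)((int32_t)value - (int32_t)offset);
}
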
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2ec7f6e7ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const uint8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x15, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z18.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z13.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x15, x17\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x15, x17\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "mov x19, x15\n"
+ "incw x19\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x17\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z17.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z17.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
+ "mov z9.d, z11.d\n"
+ "ld1b { z0.h }, p4/Z, [x16]\n"
+ ".inst 0x45521800 // usublb z0.h, z0.b, z18.b\n"
+ "mov z20.d, z17.d\n"
+ "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "mov z24.d, z11.d\n"
+ "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+ ".inst 0x45521821 // usublb z1.h, z1.b, z18.b\n"
+ "mov z19.d, z17.d\n"
+ "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "mov z26.d, z11.d\n"
+ "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+ ".inst 0x45521842 // usublb z2.h, z2.b, z18.b\n"
+ "mov z23.d, z17.d\n"
+ "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+ ".inst 0x45521863 // usublb z3.h, z3.b, z18.b\n"
+ "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+ ".inst 0x45521884 // usublb z4.h, z4.b, z18.b\n"
+ "inch x16, ALL, MUL #8\n"
+ "ld1b { z8.h }, p4/Z, [x16]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ ".inst 0x455218a5 // usublb z5.h, z5.b, z18.b\n"
+ ".inst 0x455218c6 // usublb z6.h, z6.b, z18.b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ ".inst 0x455218e7 // usublb z7.h, z7.b, z18.b\n"
+ ".inst 0x45521908 // usublb z8.h, z8.b, z18.b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ld1b { z31.h }, p3/Z, [x23, x15]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ld1b { z30.h }, p3/Z, [x22, x15]\n"
+ "ld1b { z29.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
+ "ld1b { z28.h }, p3/Z, [x20, x15]\n"
+ "ld1b { z27.h }, p3/Z, [x19, x15]\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1b7b // usublb z27.h, z27.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x448443eb // smlalb z11.s, p4/M, z31.h, z4.h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "whilelt p0.h, x14, x17\n"
+ ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "inch x16\n"
+ ".inst 0x448343e9 // smlalb z9.s, p4/M, z31.h, z3.h\n"
+ "ldr x26, [x12, #0x38]\n"
+ ".inst 0x448347f4 // smlalt z20.s, p4/M, z31.h, z3.h\n"
+ "ldr x25, [x12, #0x40]\n"
+ ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n"
+ "ldr x19, [x12, #0x48]\n"
+ ".inst 0x448147f3 // smlalt z19.s, p4/M, z31.h, z1.h\n"
+ "ldr x24, [x12, #0x50]\n"
+ ".inst 0x448043fa // smlalb z26.s, p4/M, z31.h, z0.h\n"
+ "ldr x23, [x12, #0x58]\n"
+ ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x448043cb // smlalb z11.s, p4/M, z30.h, z0.h\n"
+ "ldr x22, [x12, #0x60]\n"
+ ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z30.h }, p3/Z, [x19, x15]\n"
+ ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ "ldr x21, [x12, #0x68]\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x15]\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
+ "ldr x20, [x12, #0x70]\n"
+ ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n"
+ "ldr x19, [x12, #0x78]\n"
+ ".inst 0x44844389 // smlalb z9.s, p4/M, z28.h, z4.h\n"
+ "ld1w { z25.s }, p2/Z, [x13]\n"
+ ".inst 0x44844794 // smlalt z20.s, p4/M, z28.h, z4.h\n"
+ "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n"
+ "addvl x13, x13, #2\n"
+ ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44824793 // smlalt z19.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x4481439a // smlalb z26.s, p4/M, z28.h, z1.h\n"
+ "uzp1 z10.s, z25.s, z16.s\n"
+ "uzp2 z22.s, z25.s, z16.s\n"
+ "ld1w { z25.s }, p2/Z, [x11]\n"
+ ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x26, x15]\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n"
+ "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n"
+ ".inst 0x448647f3 // smlalt z19.s, p4/M, z31.h, z6.h\n"
+ "ld1b { z31.h }, p3/Z, [x25, x15]\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "uzp1 z21.s, z25.s, z16.s\n"
+ "uzp2 z25.s, z25.s, z16.s\n"
+ ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44864369 // smlalb z9.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44864774 // smlalt z20.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844773 // smlalt z19.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4483437a // smlalb z26.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834777 // smlalt z23.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x448843ba // smlalb z26.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
+ "ld1b { z29.h }, p3/Z, [x24, x15]\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x44804389 // smlalb z9.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44804794 // smlalt z20.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z28.h }, p3/Z, [x23, x15]\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448143e9 // smlalb z9.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147f4 // smlalt z20.s, p4/M, z31.h, z1.h\n"
+ "ld1b { z31.h }, p3/Z, [x22, x15]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x448843cb // smlalb z11.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448743c9 // smlalb z9.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448747d4 // smlalt z20.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448547d3 // smlalt z19.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448443da // smlalb z26.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447d7 // smlalt z23.s, p4/M, z30.h, z4.h\n"
+ "ld1b { z30.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
+ ".inst 0x448343ab // smlalb z11.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448047b3 // smlalt z19.s, p4/M, z29.h, z0.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x15]\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x44854794 // smlalt z20.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4482439a // smlalb z26.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44824797 // smlalt z23.s, p4/M, z28.h, z2.h\n"
+ "ld1b { z28.h }, p3/Z, [x19, x15]\n"
+ "inch x15\n"
+ ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
+ "whilelt p2.s, x15, x17\n"
+ ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n"
+ "mov x19, x15\n"
+ ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x448347f3 // smlalt z19.s, p4/M, z31.h, z3.h\n"
+ "incw x19\n"
+ ".inst 0x448843c9 // smlalb z9.s, p4/M, z30.h, z8.h\n"
+ "whilelt p1.s, x19, x17\n"
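+    // Requantization begins here, interleaved with the final accumulations:
+    // sqrdmulh by the per-channel multiplier, rounding shift (srshl), add
+    // c_offset, clamp to [minval, maxval], then narrow and store.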
+ ".inst 0x04aa756b // sqrdmulh z11.s, z11.s, z10.s\n"
+ "whilelt p3.h, x15, x17\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x448847d4 // smlalt z20.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
+ "and z16.d, z11.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z1.d, z17.d, z25.d\n"
+ "and z27.d, z9.d, z21.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n"
+ ".inst 0x448543da // smlalb z26.s, p4/M, z30.h, z5.h\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ ".inst 0x448547d7 // smlalt z23.s, p4/M, z30.h, z5.h\n"
+ "sqadd z11.s, z11.s, z16.s\n"
+ ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n"
+ "and z16.d, z20.d, z25.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z1.s\n"
+ "sqadd z9.s, z9.s, z27.s\n"
+ ".inst 0x448747b3 // smlalt z19.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448643ba // smlalb z26.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x448647b7 // smlalt z23.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n"
+ "sqadd z20.s, z20.s, z16.s\n"
+ ".inst 0x44884793 // smlalt z19.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x4487439a // smlalb z26.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04aa7718 // sqrdmulh z24.s, z24.s, z10.s\n"
+ ".inst 0x44874797 // smlalt z23.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n"
+ ".inst 0x04aa775a // sqrdmulh z26.s, z26.s, z10.s\n"
+ "and z16.d, z24.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z7.d, z19.d, z25.d\n"
+ "and z3.d, z26.d, z21.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
+ ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
+ "add z11.s, z11.s, z15.s\n"
+ "add z17.s, z17.s, z15.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "add z9.s, z9.s, z15.s\n"
+ "sqadd z26.s, z26.s, z3.s\n"
+ "and z16.d, z23.d, z25.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "smin z11.s, p4/M, z11.s, z14.s\n"
+ "smin z17.s, p4/M, z17.s, z14.s\n"
+ "smin z9.s, p4/M, z9.s, z14.s\n"
+ ".inst 0x44829334 // srshl z20.s, p4/M, z20.s, z25.s\n"
+ ".inst 0x448292b8 // srshl z24.s, p4/M, z24.s, z21.s\n"
+ "smax z11.s, p4/M, z11.s, z13.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "add z20.s, z20.s, z15.s\n"
+ "add z24.s, z24.s, z15.s\n"
+ "smax z17.s, p4/M, z17.s, z13.s\n"
+ "smax z9.s, p4/M, z9.s, z13.s\n"
+ "smin z20.s, p4/M, z20.s, z14.s\n"
+ "smin z24.s, p4/M, z24.s, z14.s\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "st1b { z11.h }, p0, [x10, x14]\n"
+ "smax z20.s, p4/M, z20.s, z13.s\n"
+ ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n"
+ "smax z24.s, p4/M, z24.s, z13.s\n"
+ ".inst 0x448292ba // srshl z26.s, p4/M, z26.s, z21.s\n"
+ ".inst 0x44829337 // srshl z23.s, p4/M, z23.s, z25.s\n"
+ "trn1 z9.h, z9.h, z20.h\n"
+ "st1b { z9.h }, p0, [x9, x14]\n"
+ "add z19.s, z19.s, z15.s\n"
+ "add z26.s, z26.s, z15.s\n"
+ "add z23.s, z23.s, z15.s\n"
+ "smin z19.s, p4/M, z19.s, z14.s\n"
+ "smin z26.s, p4/M, z26.s, z14.s\n"
+ "smin z23.s, p4/M, z23.s, z14.s\n"
+ "smax z19.s, p4/M, z19.s, z13.s\n"
+ "smax z26.s, p4/M, z26.s, z13.s\n"
+ "smax z23.s, p4/M, z23.s, z13.s\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "st1b { z24.h }, p0, [x28, x14]\n"
+ "trn1 z26.h, z26.h, z23.h\n"
+ "st1b { z26.h }, p0, [x27, x14]\n"
+ "inch x14\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z17.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z17.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z17.s, z17.s, z16.s\n"
+ "mov z9.d, z11.d\n"
+ "ld1b { z0.h }, p4/Z, [x16]\n"
+ ".inst 0x45521800 // usublb z0.h, z0.b, z18.b\n"
+ "mov z20.d, z17.d\n"
+ "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "mov z24.d, z11.d\n"
+ "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+ ".inst 0x45521821 // usublb z1.h, z1.b, z18.b\n"
+ "mov z19.d, z17.d\n"
+ "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "mov z26.d, z11.d\n"
+ "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+ ".inst 0x45521842 // usublb z2.h, z2.b, z18.b\n"
+ "mov z23.d, z17.d\n"
+ "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+ ".inst 0x45521863 // usublb z3.h, z3.b, z18.b\n"
+ "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+ ".inst 0x45521884 // usublb z4.h, z4.b, z18.b\n"
+ "inch x16, ALL, MUL #8\n"
+ "ld1b { z8.h }, p4/Z, [x16]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ ".inst 0x455218a5 // usublb z5.h, z5.b, z18.b\n"
+ ".inst 0x455218c6 // usublb z6.h, z6.b, z18.b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ ".inst 0x455218e7 // usublb z7.h, z7.b, z18.b\n"
+ ".inst 0x45521908 // usublb z8.h, z8.b, z18.b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ld1b { z31.h }, p3/Z, [x23, x15]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ld1b { z30.h }, p3/Z, [x22, x15]\n"
+ "ld1b { z29.h }, p3/Z, [x21, x15]\n"
+ ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n"
+ "ld1b { z28.h }, p3/Z, [x20, x15]\n"
+ "ld1b { z27.h }, p3/Z, [x19, x15]\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1b7b // usublb z27.h, z27.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
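
A note on the tail of the loop above, since the generated assembly is dense: once the int32 accumulators are complete, each pair is requantized with the usual fixed-point sequence — sqrdmulh against the per-channel multiplier from requant_muls, a rounding right shift by the per-channel value from requant_shifts (srshl, preceded by an and/asr/sqadd fixup of negative accumulators), addition of the output zero point c_offset, clamping to [minval, maxval], and finally trn1/st1b to narrow to bytes and store. A minimal scalar model of one lane, assuming Q0.31 multipliers and non-positive shifts as srshl consumes them, and eliding the saturation and tie-breaking corner cases:

    #include <algorithm>
    #include <cstdint>

    // Scalar sketch of the requantization performed by the SVE2 loop above.
    // "mul" and "shift" stand for one lane of requant_muls / requant_shifts;
    // the Q0.31 interpretation is an assumption, not taken from this file.
    static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                          int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: doubling, rounding, high-half multiply.
        int32_t v = static_cast<int32_t>(
            (static_cast<int64_t>(acc) * mul + (INT64_C(1) << 30)) >> 31);
        if (shift < 0)
        {
            // srshl with a negative shift count: rounding arithmetic shift right.
            const int32_t s = -shift;
            v = static_cast<int32_t>((static_cast<int64_t>(v) + (INT64_C(1) << (s - 1))) >> s);
        }
        v += c_offset;                             // add the output zero point
        v = std::min(maxval, std::max(minval, v)); // clamp to the quantized range
        return static_cast<uint8_t>(v);
    }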
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f470f78aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+ sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
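
The tile geometry declared in this header follows the standard receptive-field relation input = (output − 1) × stride + kernel. A compile-time sanity sketch over the constants above (illustrative only, not part of the library):

    // Receptive-field check for the strategy above.
    constexpr unsigned int input_extent(unsigned int out, unsigned int stride, unsigned int kern)
    {
        return (out - 1) * stride + kern; // rows of input consumed per output tile
    }
    static_assert(input_extent(2, 2, 3) == 5, "3x3/s2/2x2-output tile reads a 5x5 input");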
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..bc8f0ac1d9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const uint8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x7, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x8, #0x0\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z19.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z20.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x7, x5\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x7, x5\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x19, x7\n"
+ "incw x19\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x5\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z18.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z18.s, z16.s\n"
+ "mov z11.d, z13.d\n"
+ "ld1b { z0.h }, p4/Z, [x6]\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+ "mov z18.d, z13.d\n"
+ "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+ ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
+ "mov z10.d, z16.d\n"
+ "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+ "mov z22.d, z13.d\n"
+ "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
+ "mov z23.d, z16.d\n"
+ "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ "inch x6, ALL, MUL #8\n"
+ "ld1b { z8.h }, p4/Z, [x6]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
+ ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
+ ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x26, x7]\n"
+ ".inst 0x45531bff // usublb z31.h, z31.b, z19.b\n"
+ "ld1b { z30.h }, p3/Z, [x25, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45531bde // usublb z30.h, z30.b, z19.b\n"
+ "ld1b { z28.h }, p3/Z, [x23, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
+ "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "ld1b { z24.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
+ ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
+ ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
+ ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
+ "1:" // Loop
+ ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
+ "ldr x23, [x16, #0x40]\n"
+ "whilelt p0.h, x8, x5\n"
+ ".inst 0x448847f0 // smlalt z16.s, p4/M, z31.h, z8.h\n"
+ "ldr x22, [x16, #0x48]\n"
+ "inch x6\n"
+ ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
+ "ldr x21, [x16, #0x50]\n"
+ ".inst 0x448647e9 // smlalt z9.s, p4/M, z31.h, z6.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ ".inst 0x448243f2 // smlalb z18.s, p4/M, z31.h, z2.h\n"
+ "ldr x19, [x16, #0x60]\n"
+ ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n"
+ "ldr x10, [x16, #0x68]\n"
+ ".inst 0x448043f6 // smlalb z22.s, p4/M, z31.h, z0.h\n"
+ "ldr x9, [x16, #0x70]\n"
+ ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n"
+ "ldr x28, [x16, #0x78]\n"
+ ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
+ "ldr x27, [x16, #0x80]\n"
+ ".inst 0x448047d0 // smlalt z16.s, p4/M, z30.h, z0.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ "ldr x25, [x16, #0x90]\n"
+ ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
+ ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
+ "ldr x24, [x16, #0x98]\n"
+ ".inst 0x448147b0 // smlalt z16.s, p4/M, z29.h, z1.h\n"
+ "ld1b { z29.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
+ ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
+ ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
+ "ldr x22, [x16, #0xa8]\n"
+ ".inst 0x44834750 // smlalt z16.s, p4/M, z26.h, z3.h\n"
+ "ld1b { z26.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
+ ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
+ "ldr x21, [x16, #0xb0]\n"
+ ".inst 0x44844730 // smlalt z16.s, p4/M, z25.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
+ ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
+ "ldr x20, [x16, #0xb8]\n"
+ ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
+ "ldr x19, [x16, #0xc0]\n"
+ ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n"
+ "ld1w { z21.s }, p2/Z, [x17]\n"
+ ".inst 0x44804709 // smlalt z9.s, p4/M, z24.h, z0.h\n"
+ "ld1b { z24.h }, p3/Z, [x9, x7]\n"
+ ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
+ ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
+ "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n"
+ ".inst 0x448447a9 // smlalt z9.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z29.h }, p3/Z, [x10, x7]\n"
+ "addvl x17, x17, #2\n"
+ ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
+ "uzp1 z30.s, z21.s, z17.s\n"
+ "uzp2 z31.s, z21.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x15]\n"
+ ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
+ "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z28.h }, p3/Z, [x27, x7]\n"
+ ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
+ ".inst 0x44854770 // smlalt z16.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z27.h }, p3/Z, [x28, x7]\n"
+ ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
+ ".inst 0x44834352 // smlalb z18.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x4483474a // smlalt z10.s, p4/M, z26.h, z3.h\n"
+ "ld1b { z26.h }, p3/Z, [x26, x7]\n"
+ ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
+ ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44864730 // smlalt z16.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44804332 // smlalb z18.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x4480472a // smlalt z10.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x25, x7]\n"
+ ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
+ "uzp1 z0.s, z21.s, z17.s\n"
+ "uzp2 z21.s, z21.s, z17.s\n"
+ ".inst 0x448443b2 // smlalb z18.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z29.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
+ ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x44874710 // smlalt z16.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x44814312 // smlalb z18.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x4481470a // smlalt z10.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
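+    // As above: requantize each accumulator pair (sqrdmulh, srshl, offset,
+    // clamp) interleaved with the tail of the multiply-accumulate chain.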
+ ".inst 0x04be75ad // sqrdmulh z13.s, z13.s, z30.s\n"
+ ".inst 0x04bf7610 // sqrdmulh z16.s, z16.s, z31.s\n"
+ ".inst 0x44844376 // smlalb z22.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844777 // smlalt z23.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
+ "and z4.d, z13.d, z0.d\n"
+ "and z17.d, z16.d, z21.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44814396 // smlalb z22.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44864332 // smlalb z18.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x4486472a // smlalt z10.s, p4/M, z25.h, z6.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
+ "sqadd z13.s, z13.s, z4.s\n"
+ "sqadd z16.s, z16.s, z17.s\n"
+ ".inst 0x44854356 // smlalb z22.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854757 // smlalt z23.s, p4/M, z26.h, z5.h\n"
+ "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
+ ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448243b6 // smlalb z22.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x448247b7 // smlalt z23.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x19, x7]\n"
+ "inch x7\n"
+ ".inst 0x04be756b // sqrdmulh z11.s, z11.s, z30.s\n"
+ "whilelt p2.s, x7, x5\n"
+ ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n"
+ "mov x19, x7\n"
+ ".inst 0x44874372 // smlalb z18.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
+ ".inst 0x4487476a // smlalt z10.s, p4/M, z27.h, z7.h\n"
+ "incw x19\n"
+ ".inst 0x44834316 // smlalb z22.s, p4/M, z24.h, z3.h\n"
+ "whilelt p1.s, x19, x5\n"
+ "and z1.d, z11.d, z0.d\n"
+ "whilelt p3.h, x7, x5\n"
+ "and z17.d, z9.d, z21.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ ".inst 0x44854312 // smlalb z18.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x4485470a // smlalt z10.s, p4/M, z24.h, z5.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44834717 // smlalt z23.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44874356 // smlalb z22.s, p4/M, z26.h, z7.h\n"
+ ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n"
+ ".inst 0x44884332 // smlalb z18.s, p4/M, z25.h, z8.h\n"
+ "sqadd z11.s, z11.s, z1.s\n"
+ "sqadd z9.s, z9.s, z17.s\n"
+ "add z13.s, z13.s, z14.s\n"
+ ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
+ ".inst 0x44874757 // smlalt z23.s, p4/M, z26.h, z7.h\n"
+ ".inst 0x4488472a // smlalt z10.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x44864336 // smlalb z22.s, p4/M, z25.h, z6.h\n"
+ "and z17.d, z18.d, z0.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x04bf754a // sqrdmulh z10.s, z10.s, z31.s\n"
+ ".inst 0x44864737 // smlalt z23.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x448843b6 // smlalb z22.s, p4/M, z29.h, z8.h\n"
+ "smin z13.s, p4/M, z13.s, z15.s\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "and z1.d, z10.d, z21.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "add z16.s, z16.s, z14.s\n"
+ "sqadd z18.s, z18.s, z17.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n"
+ "smax z13.s, p4/M, z13.s, z20.s\n"
+ "smin z16.s, p4/M, z16.s, z15.s\n"
+ "sqadd z10.s, z10.s, z1.s\n"
+ "and z2.d, z22.d, z0.d\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n"
+ "smax z16.s, p4/M, z16.s, z20.s\n"
+ ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n"
+ ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
+ ".inst 0x44829012 // srshl z18.s, p4/M, z18.s, z0.s\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "st1b { z13.h }, p0, [x14, x8]\n"
+ "add z11.s, z11.s, z14.s\n"
+ "add z9.s, z9.s, z14.s\n"
+ "add z18.s, z18.s, z14.s\n"
+ "sqadd z22.s, z22.s, z2.s\n"
+ "and z16.d, z23.d, z21.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "smin z11.s, p4/M, z11.s, z15.s\n"
+ "smin z9.s, p4/M, z9.s, z15.s\n"
+ "smin z18.s, p4/M, z18.s, z15.s\n"
+ ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n"
+ ".inst 0x44829016 // srshl z22.s, p4/M, z22.s, z0.s\n"
+ "smax z11.s, p4/M, z11.s, z20.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "add z10.s, z10.s, z14.s\n"
+ "add z22.s, z22.s, z14.s\n"
+ "smax z9.s, p4/M, z9.s, z20.s\n"
+ "smax z18.s, p4/M, z18.s, z20.s\n"
+ "smin z10.s, p4/M, z10.s, z15.s\n"
+ "smin z22.s, p4/M, z22.s, z15.s\n"
+ "trn1 z11.h, z11.h, z9.h\n"
+ "st1b { z11.h }, p0, [x13, x8]\n"
+ "smax z10.s, p4/M, z10.s, z20.s\n"
+ ".inst 0x448292b7 // srshl z23.s, p4/M, z23.s, z21.s\n"
+ "smax z22.s, p4/M, z22.s, z20.s\n"
+ "trn1 z18.h, z18.h, z10.h\n"
+ "st1b { z18.h }, p0, [x12, x8]\n"
+ "add z23.s, z23.s, z14.s\n"
+ "smin z23.s, p4/M, z23.s, z15.s\n"
+ "smax z23.s, p4/M, z23.s, z20.s\n"
+ "trn1 z22.h, z22.h, z23.h\n"
+ "st1b { z22.h }, p0, [x11, x8]\n"
+ "inch x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z18.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z18.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z18.s, z16.s\n"
+ "mov z11.d, z13.d\n"
+ "ld1b { z0.h }, p4/Z, [x6]\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+ "mov z18.d, z13.d\n"
+ "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+ ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n"
+ "mov z10.d, z16.d\n"
+ "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+ "mov z22.d, z13.d\n"
+ "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+ ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
+ "mov z23.d, z16.d\n"
+ "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ "inch x6, ALL, MUL #8\n"
+ "ld1b { z8.h }, p4/Z, [x6]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
+ ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n"
+ ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x26, x7]\n"
+ ".inst 0x45531bff // usublb z31.h, z31.b, z19.b\n"
+ "ld1b { z30.h }, p3/Z, [x25, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x24, x7]\n"
+ ".inst 0x45531bde // usublb z30.h, z30.b, z19.b\n"
+ "ld1b { z28.h }, p3/Z, [x23, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n"
+ "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "ld1b { z24.h }, p3/Z, [x19, x7]\n"
+ ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n"
+ ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n"
+ ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n"
+ ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
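
Throughout these loops, every load is bracketed by a usublb that widens the uint8 values to int16 while subtracting the broadcast zero point (a_offset for activations, b_offset for weights), so the smlalb/smlalt pairs accumulate offset-corrected products into even/odd int32 lanes. A scalar model of one such multiply-accumulate, illustrative only:

    #include <cstdint>

    // One lane of the usublb + smlalb/smlalt pattern above: widen with
    // zero-point subtraction, then multiply-accumulate into int32.
    static inline int32_t mla_lane(int32_t acc, uint8_t in, uint8_t w,
                                   uint8_t a_offset, uint8_t b_offset)
    {
        const int16_t x = static_cast<int16_t>(in) - static_cast<int16_t>(a_offset); // usublb (input)
        const int16_t k = static_cast<int16_t>(w) - static_cast<int16_t>(b_offset);  // usublb (weight)
        return acc + static_cast<int32_t>(x) * static_cast<int32_t>(k);              // smlalb/smlalt
    }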
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f025b08a29
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_5x5_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_5x5_mla::get_packed_size;
+
+ kern_type kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+ sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
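
For context on how a strategy class like the one above is driven: the two interleave hooks size and fill the packed-weights buffer, and the kernel member then consumes it. The sketch below only exercises the members declared in this header and assumes the surrounding library headers are available; every variable name (args, raw_weights, and so on) is a hypothetical placeholder, not a documented entry point:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical driver for the strategy above; names are placeholders.
    using Strategy = arm_conv::depthwise::sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst;

    void run_depthwise(const CPUInfo *ci, const arm_conv::depthwise::DepthwiseArgs &args,
                       const uint8_t *raw_weights, size_t ld_weight_col, size_t ld_weight_row,
                       unsigned int n_channels, const uint8_t *const *inptrs,
                       const int32_t *bias, const arm_gemm::Requantize32 &qp,
                       const int32_t *muls, const int32_t *shifts, uint8_t *const *outptrs)
    {
        Strategy strat(ci);
        std::vector<uint8_t> packed(Strategy::get_packed_size(args));  // size the packed buffer
        Strategy::pack_parameters(n_channels, packed.data(), raw_weights,
                                  ld_weight_col, ld_weight_row);       // interleave the weights
        strat.kernel(n_channels, inptrs, packed.data(), bias, qp, muls, shifts, outptrs);
    }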
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..95423186b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const uint8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x2, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x3, #0x0\n"
+ "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z17.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x2, x0\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x2, x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "mov x19, x2\n"
+ "incw x19\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x0\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z19.s }, p2/Z, [x19]\n"
+ "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z19.s, z6.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z19.s, z6.s\n"
+ "mov z19.d, z11.d\n"
+ "ld1b { z0.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+ "mov z7.d, z11.d\n"
+ "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
+ "mov z6.d, z16.d\n"
+ "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+ "mov z12.d, z11.d\n"
+ "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "mov z8.d, z16.d\n"
+ "ldp x28, x27, [x5, #0x0]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ldp x26, x25, [x5, #0x10]\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ldp x24, x23, [x5, #0x20]\n"
+ "ldp x22, x21, [x5, #0x30]\n"
+ "ldp x20, x19, [x5, #0x40]\n"
+ "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
+ "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
+ "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
+ "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ "1:" // Loop
+ ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "whilelt p0.h, x3, x0\n"
+ ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
+ "ldr x19, [x5, #0x58]\n"
+ ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
+ "ldr x25, [x5, #0x60]\n"
+ ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
+ ".inst 0x448043a7 // smlalb z7.s, p4/M, z29.h, z0.h\n"
+ "ldr x24, [x5, #0x68]\n"
+ ".inst 0x448047a6 // smlalt z6.s, p4/M, z29.h, z0.h\n"
+ "ldr x23, [x5, #0x70]\n"
+ ".inst 0x4480438c // smlalb z12.s, p4/M, z28.h, z0.h\n"
+ "ldr x22, [x5, #0x78]\n"
+ ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
+ "ldr x15, [x5, #0x80]\n"
+ ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
+ "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
+ ".inst 0x44814373 // smlalb z19.s, p4/M, z27.h, z1.h\n"
+ "ldr x21, [x5, #0x88]\n"
+ ".inst 0x44814769 // smlalt z9.s, p4/M, z27.h, z1.h\n"
+ "ldr x20, [x5, #0x90]\n"
+ ".inst 0x44814387 // smlalb z7.s, p4/M, z28.h, z1.h\n"
+ "ldr x19, [x5, #0x98]\n"
+ ".inst 0x44814786 // smlalt z6.s, p4/M, z28.h, z1.h\n"
+ "ldr x14, [x5, #0xa0]\n"
+ ".inst 0x448142ec // smlalb z12.s, p4/M, z23.h, z1.h\n"
+ "ldr x13, [x5, #0xa8]\n"
+ ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n"
+ "ld1b { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
+ ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
+ "ldr x12, [x5, #0xb0]\n"
+ ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
+ "ldr x11, [x5, #0xb8]\n"
+ ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
+ "ldr x10, [x5, #0xc0]\n"
+ ".inst 0x448242e7 // smlalb z7.s, p4/M, z23.h, z2.h\n"
+ "ldr x9, [x5, #0xc8]\n"
+ ".inst 0x448246e6 // smlalt z6.s, p4/M, z23.h, z2.h\n"
+ "ldr x28, [x5, #0xd0]\n"
+ ".inst 0x448243ec // smlalb z12.s, p4/M, z31.h, z2.h\n"
+ "ldr x27, [x5, #0xd8]\n"
+ ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
+ "ld1b { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
+ "ldr x26, [x5, #0xe0]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
+ "ldr x25, [x5, #0xe8]\n"
+ ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
+ ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
+ "ld1w { z18.s }, p2/Z, [x4]\n"
+ ".inst 0x448343e7 // smlalb z7.s, p4/M, z31.h, z3.h\n"
+ "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n"
+ "addvl x4, x4, #2\n"
+ ".inst 0x448347e6 // smlalt z6.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448343cc // smlalb z12.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n"
+ "ld1b { z3.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "uzp1 z21.s, z18.s, z20.s\n"
+ "uzp2 z10.s, z18.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
+ "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n"
+ "addvl x6, x6, #2\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x44844373 // smlalb z19.s, p4/M, z27.h, z4.h\n"
+ "ldr x24, [x5, #0xf0]\n"
+ ".inst 0x44844769 // smlalt z9.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448443c7 // smlalb z7.s, p4/M, z30.h, z4.h\n"
+ "ldr x23, [x5, #0xf8]\n"
+ ".inst 0x448447c6 // smlalt z6.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448043ab // smlalb z11.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n"
+ "uzp1 z29.s, z18.s, z20.s\n"
+ "uzp2 z20.s, z18.s, z20.s\n"
+ ".inst 0x44804393 // smlalb z19.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x448042c7 // smlalb z7.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448046c6 // smlalt z6.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x4480432c // smlalb z12.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
+ ".inst 0x448142f3 // smlalb z19.s, p4/M, z23.h, z1.h\n"
+ "ldr x22, [x5, #0x100]\n"
+ ".inst 0x448146e9 // smlalt z9.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x44814327 // smlalb z7.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814726 // smlalt z6.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x4481430c // smlalb z12.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
+ ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
+ ".inst 0x448242eb // smlalb z11.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n"
+ "ld1b { z23.h }, p3/Z, [x15, x2]\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x448243f3 // smlalb z19.s, p4/M, z31.h, z2.h\n"
+ "ldr x21, [x5, #0x108]\n"
+ ".inst 0x448247e9 // smlalt z9.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44824307 // smlalb z7.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824706 // smlalt z6.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x4482436c // smlalb z12.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x448343eb // smlalb z11.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
+ ".inst 0x448343d3 // smlalb z19.s, p4/M, z30.h, z3.h\n"
+ "ldr x20, [x5, #0x110]\n"
+ ".inst 0x448347c9 // smlalt z9.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44834367 // smlalb z7.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834766 // smlalt z6.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448342ec // smlalb z12.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
+ "ld1b { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x448443cb // smlalb z11.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n"
+ "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
+ ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
+ "ldr x19, [x5, #0x118]\n"
+ ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z26.h }, p3/Z, [x14, x2]\n"
+ ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
+ ".inst 0x448442e7 // smlalb z7.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448446e6 // smlalt z6.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x4484438c // smlalb z12.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448042cb // smlalb z11.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n"
+ "ld1b { z22.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x44804333 // smlalb z19.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804729 // smlalt z9.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x448043e7 // smlalb z7.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047e6 // smlalt z6.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448043cc // smlalb z12.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4481432b // smlalb z11.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n"
+ "ld1b { z25.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x44814313 // smlalb z19.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
+ ".inst 0x44814709 // smlalt z9.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x448143c7 // smlalb z7.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448147c6 // smlalt z6.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x4481434c // smlalb z12.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n"
+ "ld1b { z1.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
+ ".inst 0x4482430b // smlalb z11.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n"
+ "ld1b { z24.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x44824373 // smlalb z19.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824347 // smlalb z7.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824746 // smlalt z6.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
+ "ld1b { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z27.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448342f3 // smlalb z19.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448346e9 // smlalt z9.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
+ "ld1b { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x448442eb // smlalb z11.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n"
+ "ld1b { z23.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x44844393 // smlalb z19.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z28.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
+ ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x448442cc // smlalb z12.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n"
+ "ld1b { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
+ ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x44804367 // smlalb z7.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44804766 // smlalt z6.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x448042ec // smlalb z12.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n"
+ "ld1b { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n"
+ "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
+ ".inst 0x44814353 // smlalb z19.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44814749 // smlalt z9.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x448142e7 // smlalb z7.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448146e6 // smlalt z6.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143ec // smlalb z12.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n"
+ "ld1b { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
+ ".inst 0x4482434b // smlalb z11.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z26.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
+ ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x448243e7 // smlalb z7.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247e6 // smlalt z6.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243cc // smlalb z12.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n"
+ "ld1b { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n"
+ "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
+ ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x448343c7 // smlalb z7.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448347c6 // smlalt z6.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x4483438c // smlalb z12.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n"
+ "ld1b { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x448442d3 // smlalb z19.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x448446c9 // smlalt z9.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x44844387 // smlalb z7.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844786 // smlalt z6.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z4.h }, p4/Z, [x1]\n"
+ "inch x1\n"
+ ".inst 0x4480436b // smlalb z11.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n"
+ "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x448042f3 // smlalb z19.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448046e9 // smlalt z9.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x44804327 // smlalb z7.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804726 // smlalt z6.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
+ ".inst 0x4480430c // smlalb z12.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x448142eb // smlalb z11.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143f3 // smlalb z19.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44814307 // smlalb z7.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814706 // smlalt z6.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x4481436c // smlalb z12.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243d3 // smlalb z19.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x448247c9 // smlalt z9.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824367 // smlalb z7.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824766 // smlalt z6.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x19, x2]\n"
+ "inch x2\n"
+ ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n"
+ "whilelt p2.s, x2, x0\n"
+ ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
+ "mov x19, x2\n"
+ ".inst 0x448343cb // smlalb z11.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n"
+ "incw x19\n"
+ ".inst 0x44834393 // smlalb z19.s, p4/M, z28.h, z3.h\n"
+ "whilelt p1.s, x19, x0\n"
+ ".inst 0x44834789 // smlalt z9.s, p4/M, z28.h, z3.h\n"
+ "whilelt p3.h, x2, x0\n"
+ ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x4484438b // smlalb z11.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x04b5756b // sqrdmulh z11.s, z11.s, z21.s\n"
+ ".inst 0x04aa7610 // sqrdmulh z16.s, z16.s, z10.s\n"
+ ".inst 0x04b57673 // sqrdmulh z19.s, z19.s, z21.s\n"
+ ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n"
+ "and z31.d, z11.d, z29.d\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z23.d, z16.d, z20.d\n"
+ "and z25.d, z19.d, z29.d\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "and z18.d, z9.d, z20.d\n"
+ ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z31.s\n"
+ ".inst 0x4484436c // smlalb z12.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x04b574e7 // sqrdmulh z7.s, z7.s, z21.s\n"
+ "sqadd z16.s, z16.s, z23.s\n"
+ "sqadd z19.s, z19.s, z25.s\n"
+ ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n"
+ "sqadd z9.s, z9.s, z18.s\n"
+ "and z1.d, z7.d, z29.d\n"
+ "asr z1.s, z1.s, #0x1f\n"
+ "and z18.d, z6.d, z20.d\n"
+ ".inst 0x04b5758c // sqrdmulh z12.s, z12.s, z21.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x448293ab // srshl z11.s, p4/M, z11.s, z29.s\n"
+ "and z30.d, z12.d, z29.d\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "add z11.s, z11.s, z14.s\n"
+ "sqadd z7.s, z7.s, z1.s\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "smin z11.s, p4/M, z11.s, z15.s\n"
+ ".inst 0x44829290 // srshl z16.s, p4/M, z16.s, z20.s\n"
+ "sqadd z12.s, z12.s, z30.s\n"
+ "and z3.d, z8.d, z20.d\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "add z16.s, z16.s, z14.s\n"
+ "smax z11.s, p4/M, z11.s, z5.s\n"
+ ".inst 0x448293b3 // srshl z19.s, p4/M, z19.s, z29.s\n"
+ ".inst 0x44829289 // srshl z9.s, p4/M, z9.s, z20.s\n"
+ "smin z16.s, p4/M, z16.s, z15.s\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "add z19.s, z19.s, z14.s\n"
+ "add z9.s, z9.s, z14.s\n"
+ "sqadd z8.s, z8.s, z3.s\n"
+ "add z7.s, z7.s, z14.s\n"
+ "smax z16.s, p4/M, z16.s, z5.s\n"
+ "smin z19.s, p4/M, z19.s, z15.s\n"
+ "smin z9.s, p4/M, z9.s, z15.s\n"
+ "smin z7.s, p4/M, z7.s, z15.s\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "st1b { z11.h }, p0, [x7, x3]\n"
+ "smax z19.s, p4/M, z19.s, z5.s\n"
+ "smax z9.s, p4/M, z9.s, z5.s\n"
+ "smax z7.s, p4/M, z7.s, z5.s\n"
+ ".inst 0x44829286 // srshl z6.s, p4/M, z6.s, z20.s\n"
+ ".inst 0x448293ac // srshl z12.s, p4/M, z12.s, z29.s\n"
+ "trn1 z19.h, z19.h, z9.h\n"
+ "st1b { z19.h }, p0, [x8, x3]\n"
+ "add z6.s, z6.s, z14.s\n"
+ ".inst 0x44829288 // srshl z8.s, p4/M, z8.s, z20.s\n"
+ "add z12.s, z12.s, z14.s\n"
+ "smin z6.s, p4/M, z6.s, z15.s\n"
+ "add z8.s, z8.s, z14.s\n"
+ "smin z12.s, p4/M, z12.s, z15.s\n"
+ "smax z6.s, p4/M, z6.s, z5.s\n"
+ "smin z8.s, p4/M, z8.s, z15.s\n"
+ "smax z12.s, p4/M, z12.s, z5.s\n"
+ "trn1 z7.h, z7.h, z6.h\n"
+ "st1b { z7.h }, p0, [x17, x3]\n"
+ "smax z8.s, p4/M, z8.s, z5.s\n"
+ "trn1 z12.h, z12.h, z8.h\n"
+ "st1b { z12.h }, p0, [x16, x3]\n"
+ "inch x3\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z19.s }, p2/Z, [x19]\n"
+ "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z11.s, z19.s, z6.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z16.s, z19.s, z6.s\n"
+ "mov z19.d, z11.d\n"
+ "ld1b { z0.h }, p4/Z, [x1]\n"
+ ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n"
+ "mov z9.d, z16.d\n"
+ "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+ "mov z7.d, z11.d\n"
+ "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n"
+ "mov z6.d, z16.d\n"
+ "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+ "mov z12.d, z11.d\n"
+ "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "mov z8.d, z16.d\n"
+ "ldp x28, x27, [x5, #0x0]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ldp x26, x25, [x5, #0x10]\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ldp x24, x23, [x5, #0x20]\n"
+ "ldp x22, x21, [x5, #0x30]\n"
+ "ldp x20, x19, [x5, #0x40]\n"
+ "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n"
+ "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n"
+ "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
+ "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
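+
+// A rough scalar model of the requantisation tail above, assuming the
+// multipliers sit in z21/z10 and the (right-)shifts in z29/z20, with
+// c_offset, minval and maxval taken from the Requantize32 block;
+// rounding_shift_right is a hypothetical helper standing in for srshl
+// together with the and/asr/sqadd rounding fixup:
+//
+//   int32_t requantize(int32_t acc, int32_t mul, int32_t shift)
+//   {
+//     // SQRDMULH: doubling multiply, keep the rounded high half.
+//     int32_t y = (int32_t) (((int64_t) acc * mul + (1 << 30)) >> 31);
+//     y = rounding_shift_right(y, shift);         // SRSHL by a negative shift
+//     y += c_offset;                              // add z14.s
+//     y = std::min(std::max(y, minval), maxval);  // smax/smin clamp
+//     return y;
+//   }
+//
+// Pairs of lanes are then narrowed with trn1 and stored as bytes with st1b.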
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..9226a96662
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+ typedef uint32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 4;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 9;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+ sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
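+
+// The tile geometry follows the usual dense-convolution relation
+// input = stride * (output - 1) + kernel: rows 2 * (2 - 1) + 3 = 5 and
+// cols 2 * (4 - 1) + 3 = 9, matching input_rows and input_cols above.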
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..bb9931c20f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov z31.s, #0x0\n"
+ "ldr x24, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "mov z18.s, #0x0\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "lsl x9, %x[n_channels], #0x2\n"
+ "mov z29.s, #0x0\n"
+ "ldr x22, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "mov z28.s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "mov x19, #0x9\n"
+ "mov z13.s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "mov z14.s, #0x0\n"
+ "ld1b { z7.b }, p1/Z, [x24]\n"
+ "mov x19, #0x3\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z3.b }, p1/Z, [x23]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "mov z11.b, p0/z, #0x1\n"
+ "ld1b { z4.b }, p1/Z, [x22]\n"
+ "mov x28, #0x0\n"
+ "mov z10.d, z7.d\n"
+ "ld1b { z6.b }, p1/Z, [x21]\n"
+ "mov x27, #0x0\n"
+ "ext z10.b, z10.b, z10.b, #0x2\n"
+ "ld1b { z5.b }, p1/Z, [x20]\n"
+ "whilelt p1.b, x28, x9\n"
+ "mov z17.d, z7.d\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z26.d, z7.d\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "mov z19.d, z3.d\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "ext z19.b, z19.b, z19.b, #0x2\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z7.s, z7.s, z17.s\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "zip1 z10.s, z10.s, z26.s\n"
+ "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "zip1 z7.s, z7.s, z10.s\n"
+ "ld1w { z1.s }, p1/Z, [%x[params]]\n"
+ "mov z7.q, z7.q[0]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z2.d, z3.d\n"
+ "mov z20.d, z4.d\n"
+ "ext z2.b, z2.b, z2.b, #0x6\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "ext z20.b, z20.b, z20.b, #0x2\n"
+ "mov z17.d, z4.d\n"
+ "zip1 z19.s, z19.s, z2.s\n"
+ "zip1 z3.s, z3.s, z19.s\n"
+ "mov z3.q, z3.q[0]\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "mov z26.d, z4.d\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "mov z21.d, z6.d\n"
+ "zip1 z4.s, z4.s, z17.s\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "zip1 z20.s, z20.s, z26.s\n"
+ "zip1 z4.s, z4.s, z20.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z17.d, z6.d\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "mov z20.d, z6.d\n"
+ "ext z20.b, z20.b, z20.b, #0x6\n"
+ "mov z19.d, z5.d\n"
+ "zip1 z6.s, z6.s, z17.s\n"
+ "ext z19.b, z19.b, z19.b, #0x2\n"
+ "zip1 z21.s, z21.s, z20.s\n"
+ "zip1 z6.s, z6.s, z21.s\n"
+ "mov z6.q, z6.q[0]\n"
+ "mov z17.d, z5.d\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "mov z20.d, z5.d\n"
+ "ext z20.b, z20.b, z20.b, #0x6\n"
+ "mov z11.s, z11.s[0]\n"
+ "zip1 z5.s, z5.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "zip1 z19.s, z19.s, z20.s\n"
+ "zip1 z5.s, z5.s, z19.s\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "udot z18.s, z11.b, z7.b[1]\n"
+ "udot z29.s, z11.b, z7.b[2]\n"
+ "udot z28.s, z11.b, z7.b[3]\n"
+ "udot z13.s, z11.b, z3.b[0]\n"
+ "udot z14.s, z11.b, z3.b[1]\n"
+ "udot z15.s, z11.b, z3.b[2]\n"
+ "udot z25.s, z11.b, z3.b[3]\n"
+ "udot z26.s, z11.b, z4.b[0]\n"
+ "udot z27.s, z11.b, z4.b[1]\n"
+ "udot z24.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z4.b[3]\n"
+ "udot z22.s, z11.b, z6.b[0]\n"
+ "udot z21.s, z11.b, z6.b[1]\n"
+ "udot z17.s, z11.b, z6.b[2]\n"
+ "udot z20.s, z11.b, z6.b[3]\n"
+ "udot z2.s, z11.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z5.b[1]\n"
+ "mov z31.d, z31.d\n"
+ "mov z18.d, z18.d\n"
+ "mov z29.d, z29.d\n"
+ "mov z28.d, z28.d\n"
+ "add z31.s, z31.s, z13.s\n"
+ "mov z13.s, #0x0\n"
+ "udot z13.s, z11.b, z5.b[2]\n"
+ "add z18.s, z18.s, z14.s\n"
+ "mov z14.s, #0x0\n"
+ "udot z14.s, z11.b, z5.b[3]\n"
+ "add z29.s, z29.s, z15.s\n"
+ "add z28.s, z28.s, z25.s\n"
+ "add z31.s, z31.s, z26.s\n"
+ "add z18.s, z18.s, z27.s\n"
+ "add z29.s, z29.s, z24.s\n"
+ "add z28.s, z28.s, z23.s\n"
+ "mov z26.d, z26.d\n"
+ "mov z25.d, z27.d\n"
+ "mov z24.d, z24.d\n"
+ "mov z23.d, z23.d\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z24.s, z24.s, z17.s\n"
+ "add z23.s, z23.s, z20.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z23.s, z23.s, z14.s\n"
+ "neg z30.s, p2/M, z30.s\n"
+ "mul z31.s, p2/M, z31.s, z30.s\n"
+ "st1w { z31.s }, p2, [SP]\n"
+ "add z31.s, z31.s, z1.s\n"
+ "mul z18.s, p2/M, z18.s, z30.s\n"
+ "st1w { z18.s }, p2, [SP, #1, MUL VL]\n"
+ "add z18.s, z18.s, z1.s\n"
+ "mul z29.s, p2/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+ "add z29.s, z29.s, z1.s\n"
+ "mul z28.s, p2/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p2, [SP, #3, MUL VL]\n"
+ "add z28.s, z28.s, z1.s\n"
+ "mul z26.s, p2/M, z26.s, z30.s\n"
+ "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+ "add z26.s, z26.s, z1.s\n"
+ "mul z25.s, p2/M, z25.s, z30.s\n"
+ "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "mul z24.s, p2/M, z24.s, z30.s\n"
+ "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+ "add z24.s, z24.s, z1.s\n"
+ "mul z23.s, p2/M, z23.s, z30.s\n"
+ "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+ "add z23.s, z23.s, z1.s\n"
+ "1:" // Loop
+ "udot z31.s, z8.b, z7.b[0]\n"
+ "ld1w { z22.s }, p2/Z, [%x[params]]\n"
+ "incb x28\n"
+ "udot z18.s, z8.b, z7.b[1]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "whilelt p0.s, x27, %x[n_channels]\n"
+ "udot z29.s, z8.b, z7.b[2]\n"
+ "whilelt p1.b, x28, x9\n"
+ "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "udot z28.s, z8.b, z7.b[3]\n"
+ "udot z26.s, z8.b, z4.b[0]\n"
+ "udot z25.s, z8.b, z4.b[1]\n"
+ "udot z24.s, z8.b, z4.b[2]\n"
+ "udot z23.s, z8.b, z4.b[3]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "udot z31.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z9.b, z3.b[1]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z28.s, z9.b, z3.b[3]\n"
+ "udot z26.s, z9.b, z6.b[0]\n"
+ "udot z25.s, z9.b, z6.b[1]\n"
+ "udot z24.s, z9.b, z6.b[2]\n"
+ "udot z23.s, z9.b, z6.b[3]\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "udot z31.s, z10.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[1]\n"
+ "udot z29.s, z10.b, z4.b[2]\n"
+ "udot z28.s, z10.b, z4.b[3]\n"
+ "udot z26.s, z10.b, z5.b[0]\n"
+ "udot z25.s, z10.b, z5.b[1]\n"
+ "udot z24.s, z10.b, z5.b[2]\n"
+ "udot z23.s, z10.b, z5.b[3]\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ ".inst 0x04b677ff // sqrdmulh z31.s, z31.s, z22.s\n"
+ ".inst 0x04b67652 // sqrdmulh z18.s, z18.s, z22.s\n"
+ ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
+ ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
+ ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
+ "and z20.d, z31.d, z21.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z18.d, z21.d\n"
+ "and z14.d, z29.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z17.d, z28.d, z21.d\n"
+ "and z2.d, z26.d, z21.d\n"
+ "asr z14.s, z14.s, #0x1f\n"
+ ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
+ "sqadd z18.s, z18.s, z19.s\n"
+ "sqadd z29.s, z29.s, z14.s\n"
+ "and z27.d, z25.d, z21.d\n"
+ "asr z27.s, z27.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ "sqadd z26.s, z26.s, z2.s\n"
+ "and z17.d, z24.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "and z15.d, z23.d, z21.d\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z27.s\n"
+ ".inst 0x44828ab2 // srshl z18.s, p2/M, z18.s, z21.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z17.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "sqadd z23.s, z23.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z0.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "smin z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "smax z31.s, p2/M, z31.s, z16.s\n"
+ "st1b { z31.s }, p0, [x26, x27]\n"
+ "add z28.s, z28.s, z12.s\n"
+ "smax z18.s, p2/M, z18.s, z16.s\n"
+ "ld1w { z31.s }, p2/Z, [SP]\n"
+ "smin z29.s, p2/M, z29.s, z0.s\n"
+ "st1b { z18.s }, p0, [x25, x27]\n"
+ "add z31.s, z31.s, z1.s\n"
+ "smin z28.s, p2/M, z28.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n"
+ "smax z29.s, p2/M, z29.s, z16.s\n"
+ "st1b { z29.s }, p0, [x24, x27]\n"
+ "add z18.s, z18.s, z1.s\n"
+ "smax z28.s, p2/M, z28.s, z16.s\n"
+ "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "st1b { z28.s }, p0, [x23, x27]\n"
+ "add z29.s, z29.s, z1.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n"
+ "add z26.s, z26.s, z12.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z28.s, z28.s, z1.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "smin z26.s, p2/M, z26.s, z0.s\n"
+ "smin z25.s, p2/M, z25.s, z0.s\n"
+ "smin z24.s, p2/M, z24.s, z0.s\n"
+ "smin z23.s, p2/M, z23.s, z0.s\n"
+ "smax z26.s, p2/M, z26.s, z16.s\n"
+ "st1b { z26.s }, p0, [x22, x27]\n"
+ "smax z25.s, p2/M, z25.s, z16.s\n"
+ "smax z24.s, p2/M, z24.s, z16.s\n"
+ "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+ "smax z23.s, p2/M, z23.s, z16.s\n"
+ "st1b { z25.s }, p0, [x21, x27]\n"
+ "add z26.s, z26.s, z1.s\n"
+ "st1b { z24.s }, p0, [x20, x27]\n"
+ "st1b { z23.s }, p0, [x19, x27]\n"
+ "incw x27\n"
+ "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+ "add z24.s, z24.s, z1.s\n"
+ "add z23.s, z23.s, z1.s\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
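+
+// A note on the prologue of the kernel above: the udot instructions against
+// the all-ones vector z11 accumulate, for each output pixel, the sum of the
+// input bytes under its window. Each sum is multiplied by the negated weight
+// offset (z30 holds b_offset before negation), spilled to the stack, and
+// folded into the initial accumulators: this is the -b_offset * sum(x) term
+// in the expansion of sum((x - a_offset) * (w - b_offset)), with the other
+// offset terms presumably baked into the packed parameters. The spilled sums
+// are reloaded and re-biased at the end of each loop iteration.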
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..3023ed16e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+ typedef uint32_t bias_type;
+ typedef uint8_t input_type;
+ typedef uint8_t weight_type;
+ typedef uint8_t return_type;
+
+ typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 8;
+ constexpr static unsigned int input_cols = 6;
+ constexpr static unsigned int input_col_quads = 1;
+
+ kern_type kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+ sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..fc1e23e897
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov z20.b, #0x1\n"
+ "ldr x24, [%x[inptrs], #0x0]\n"
+ "ptrue p2.b\n"
+ "mov z22.s, #0x1\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "lsl x9, %x[n_channels], #0x2\n"
+ "mov z30.s, #0x0\n"
+ "ldr x22, [%x[inptrs], #0x10]\n"
+ "addvl SP, SP, #-8\n"
+ "mov z28.s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "mov x20, #0x6\n"
+ "mov z29.s, #0x0\n"
+ "ldr x19, [%x[inptrs], #0x20]\n"
+ "whilelt p0.b, XZR, x20\n"
+ "mov z27.s, #0x0\n"
+ "ld1b { z0.b }, p0/Z, [x24]\n"
+ "mov x28, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "ld1b { z3.b }, p0/Z, [x23]\n"
+ "mov x27, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x22]\n"
+ "whilelt p1.b, x28, x9\n"
+ "mov z15.d, z0.d\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "mov z24.s, #0x0\n"
+ "ld1b { z6.b }, p0/Z, [x19]\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x21, [%x[inptrs], #0x28]\n"
+ "mov z16.d, z3.d\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ldr x19, [%x[inptrs], #0x38]\n"
+ "mov z18.d, z5.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "zip1 z0.d, z0.d, z15.d\n"
+ "ld1b { z1.b }, p0/Z, [x20]\n"
+ "mov z0.q, z0.q[0]\n"
+ "ld1b { z2.b }, p0/Z, [x19]\n"
+ "zip1 z3.d, z3.d, z16.d\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z3.q, z3.q[0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x0]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ldp x24, x23, [%x[outptrs], #0x10]\n"
+ "mov z16.d, z4.d\n"
+ "ldp x22, x21, [%x[outptrs], #0x20]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ldp x20, x19, [%x[outptrs], #0x30]\n"
+ "mov z17.d, z6.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z5.q, z5.q[0]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "zip1 z4.d, z4.d, z16.d\n"
+ "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "mov z16.d, z7.d\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "zip1 z6.d, z6.d, z17.d\n"
+ "mov z17.d, z1.d\n"
+ "mov z6.q, z6.q[0]\n"
+ "zip1 z7.d, z7.d, z16.d\n"
+ "mov z7.q, z7.q[0]\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z16.d, z2.d\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "mov z23.s, #0x0\n"
+ "zip1 z1.d, z1.d, z17.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "zip1 z2.d, z2.d, z16.d\n"
+ "mov z2.q, z2.q[0]\n"
+ "mov z18.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "udot z30.s, z20.b, z0.b[0]\n"
+ "udot z28.s, z20.b, z0.b[2]\n"
+ "udot z29.s, z20.b, z3.b[0]\n"
+ "udot z27.s, z20.b, z3.b[2]\n"
+ "udot z30.s, z22.b, z0.b[1]\n"
+ "udot z28.s, z22.b, z0.b[3]\n"
+ "udot z29.s, z22.b, z3.b[1]\n"
+ "udot z27.s, z22.b, z3.b[3]\n"
+ "udot z26.s, z20.b, z5.b[0]\n"
+ "udot z25.s, z20.b, z5.b[2]\n"
+ "udot z24.s, z20.b, z4.b[0]\n"
+ "udot z23.s, z20.b, z4.b[2]\n"
+ "udot z26.s, z22.b, z5.b[1]\n"
+ "udot z25.s, z22.b, z5.b[3]\n"
+ "udot z24.s, z22.b, z4.b[1]\n"
+ "udot z23.s, z22.b, z4.b[3]\n"
+ "udot z18.s, z20.b, z6.b[0]\n"
+ "udot z17.s, z20.b, z6.b[2]\n"
+ "udot z16.s, z20.b, z7.b[0]\n"
+ "udot z21.s, z20.b, z7.b[2]\n"
+ "udot z18.s, z22.b, z6.b[1]\n"
+ "udot z17.s, z22.b, z6.b[3]\n"
+ "udot z16.s, z22.b, z7.b[1]\n"
+ "udot z21.s, z22.b, z7.b[3]\n"
+ "udot z19.s, z20.b, z1.b[0]\n"
+ "mov z30.d, z30.d\n"
+ "mov z28.d, z28.d\n"
+ "add z30.s, z30.s, z29.s\n"
+ "udot z19.s, z22.b, z1.b[1]\n"
+ "add z28.s, z28.s, z27.s\n"
+ "add z30.s, z30.s, z26.s\n"
+ "mov z29.d, z29.d\n"
+ "add z28.s, z28.s, z25.s\n"
+ "add z30.s, z30.s, z24.s\n"
+ "mov z27.d, z27.d\n"
+ "add z28.s, z28.s, z23.s\n"
+ "add z30.s, z30.s, z18.s\n"
+ "add z29.s, z29.s, z26.s\n"
+ "add z28.s, z28.s, z17.s\n"
+ "add z27.s, z27.s, z25.s\n"
+ "add z29.s, z29.s, z24.s\n"
+ "mov z26.d, z26.d\n"
+ "add z27.s, z27.s, z23.s\n"
+ "add z29.s, z29.s, z18.s\n"
+ "mov z25.d, z25.d\n"
+ "add z27.s, z27.s, z17.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z26.s, z26.s, z24.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "add z25.s, z25.s, z23.s\n"
+ "add z26.s, z26.s, z18.s\n"
+ "mov z24.d, z24.d\n"
+ "add z25.s, z25.s, z17.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "mov z23.d, z23.d\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z26.s, z26.s, z19.s\n"
+ "add z24.s, z24.s, z18.s\n"
+ "mov z18.s, #0x0\n"
+ "udot z18.s, z20.b, z1.b[2]\n"
+ "add z23.s, z23.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z20.b, z2.b[0]\n"
+ "udot z18.s, z22.b, z1.b[3]\n"
+ "add z24.s, z24.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "udot z17.s, z22.b, z2.b[1]\n"
+ "udot z16.s, z20.b, z2.b[2]\n"
+ "add z25.s, z25.s, z18.s\n"
+ "add z23.s, z23.s, z21.s\n"
+ "add z24.s, z24.s, z19.s\n"
+ "udot z16.s, z22.b, z2.b[3]\n"
+ "add z23.s, z23.s, z18.s\n"
+ "add z24.s, z24.s, z17.s\n"
+ "neg z15.s, p2/M, z15.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z15.s\n"
+ "st1w { z30.s }, p2, [SP]\n"
+ "add z30.s, z30.s, z13.s\n"
+ "mul z28.s, p2/M, z28.s, z15.s\n"
+ "st1w { z28.s }, p2, [SP, #1, MUL VL]\n"
+ "add z28.s, z28.s, z13.s\n"
+ "mul z29.s, p2/M, z29.s, z15.s\n"
+ "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+ "add z29.s, z29.s, z13.s\n"
+ "mul z27.s, p2/M, z27.s, z15.s\n"
+ "st1w { z27.s }, p2, [SP, #3, MUL VL]\n"
+ "add z27.s, z27.s, z13.s\n"
+ "mul z26.s, p2/M, z26.s, z15.s\n"
+ "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+ "add z26.s, z26.s, z13.s\n"
+ "mul z25.s, p2/M, z25.s, z15.s\n"
+ "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z13.s\n"
+ "mul z24.s, p2/M, z24.s, z15.s\n"
+ "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "mul z23.s, p2/M, z23.s, z15.s\n"
+ "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+ "add z23.s, z23.s, z13.s\n"
+ "1:" // Loop
+ "udot z30.s, z8.b, z0.b[0]\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "incb x28\n"
+ "udot z28.s, z8.b, z0.b[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "whilelt p0.s, x27, %x[n_channels]\n"
+ "udot z29.s, z8.b, z3.b[0]\n"
+ "whilelt p1.b, x28, x9\n"
+ "udot z27.s, z8.b, z3.b[2]\n"
+ "udot z26.s, z8.b, z5.b[0]\n"
+ "udot z25.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z4.b[0]\n"
+ "udot z23.s, z8.b, z4.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+ "udot z30.s, z9.b, z0.b[1]\n"
+ "udot z28.s, z9.b, z0.b[3]\n"
+ "udot z29.s, z9.b, z3.b[1]\n"
+ "udot z27.s, z9.b, z3.b[3]\n"
+ "udot z26.s, z9.b, z5.b[1]\n"
+ "udot z25.s, z9.b, z5.b[3]\n"
+ "udot z24.s, z9.b, z4.b[1]\n"
+ "udot z23.s, z9.b, z4.b[3]\n"
+ "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z28.s, z10.b, z3.b[2]\n"
+ "udot z29.s, z10.b, z5.b[0]\n"
+ "udot z27.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z4.b[0]\n"
+ "udot z25.s, z10.b, z4.b[2]\n"
+ "udot z24.s, z10.b, z6.b[0]\n"
+ "udot z23.s, z10.b, z6.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z30.s, z11.b, z3.b[1]\n"
+ "udot z28.s, z11.b, z3.b[3]\n"
+ "udot z29.s, z11.b, z5.b[1]\n"
+ "udot z27.s, z11.b, z5.b[3]\n"
+ "udot z26.s, z11.b, z4.b[1]\n"
+ "udot z25.s, z11.b, z4.b[3]\n"
+ "udot z24.s, z11.b, z6.b[1]\n"
+ "udot z23.s, z11.b, z6.b[3]\n"
+ "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z30.s, z8.b, z5.b[0]\n"
+ "udot z28.s, z8.b, z5.b[2]\n"
+ "udot z29.s, z8.b, z4.b[0]\n"
+ "udot z27.s, z8.b, z4.b[2]\n"
+ "udot z26.s, z8.b, z6.b[0]\n"
+ "udot z25.s, z8.b, z6.b[2]\n"
+ "udot z24.s, z8.b, z7.b[0]\n"
+ "udot z23.s, z8.b, z7.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z30.s, z9.b, z5.b[1]\n"
+ "udot z28.s, z9.b, z5.b[3]\n"
+ "udot z29.s, z9.b, z4.b[1]\n"
+ "udot z27.s, z9.b, z4.b[3]\n"
+ "udot z26.s, z9.b, z6.b[1]\n"
+ "udot z25.s, z9.b, z6.b[3]\n"
+ "udot z24.s, z9.b, z7.b[1]\n"
+ "udot z23.s, z9.b, z7.b[3]\n"
+ "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "udot z30.s, z10.b, z4.b[0]\n"
+ "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z28.s, z10.b, z4.b[2]\n"
+ "udot z29.s, z10.b, z6.b[0]\n"
+ "udot z27.s, z10.b, z6.b[2]\n"
+ "udot z26.s, z10.b, z7.b[0]\n"
+ "udot z25.s, z10.b, z7.b[2]\n"
+ "udot z24.s, z10.b, z1.b[0]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z30.s, z11.b, z4.b[1]\n"
+ "udot z28.s, z11.b, z4.b[3]\n"
+ "udot z29.s, z11.b, z6.b[1]\n"
+ "udot z27.s, z11.b, z6.b[3]\n"
+ "udot z26.s, z11.b, z7.b[1]\n"
+ "udot z25.s, z11.b, z7.b[3]\n"
+ "udot z24.s, z11.b, z1.b[1]\n"
+ "udot z23.s, z11.b, z1.b[3]\n"
+ "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "udot z30.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z6.b[2]\n"
+ "udot z29.s, z8.b, z7.b[0]\n"
+ "udot z27.s, z8.b, z7.b[2]\n"
+ "udot z26.s, z8.b, z1.b[0]\n"
+ "udot z25.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z23.s, z8.b, z2.b[2]\n"
+ "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "udot z30.s, z9.b, z6.b[1]\n"
+ "udot z28.s, z9.b, z6.b[3]\n"
+ "udot z29.s, z9.b, z7.b[1]\n"
+ "udot z27.s, z9.b, z7.b[3]\n"
+ "udot z26.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z1.b[3]\n"
+ "udot z24.s, z9.b, z2.b[1]\n"
+ "udot z23.s, z9.b, z2.b[3]\n"
+ "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "addvl %x[params], %x[params], #-3\n"
+ ".inst 0x04b677de // sqrdmulh z30.s, z30.s, z22.s\n"
+ ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n"
+ ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n"
+ ".inst 0x04b6777b // sqrdmulh z27.s, z27.s, z22.s\n"
+ ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n"
+ "and z20.d, z30.d, z21.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z17.d, z27.d, z21.d\n"
+ "and z16.d, z26.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "sqadd z30.s, z30.s, z20.s\n"
+ ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ "and z18.d, z25.d, z21.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z17.s\n"
+ "sqadd z26.s, z26.s, z16.s\n"
+ "and z17.d, z24.d, z21.d\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "and z16.d, z23.d, z21.d\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "sqadd z24.s, z24.s, z17.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "smax z30.s, p2/M, z30.s, z31.s\n"
+ "st1b { z30.s }, p0, [x26, x27]\n"
+ "add z27.s, z27.s, z14.s\n"
+ "smax z28.s, p2/M, z28.s, z31.s\n"
+ "ld1w { z30.s }, p2/Z, [SP]\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "st1b { z28.s }, p0, [x25, x27]\n"
+ "add z30.s, z30.s, z13.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n"
+ "smax z29.s, p2/M, z29.s, z31.s\n"
+ "st1b { z29.s }, p0, [x24, x27]\n"
+ "add z28.s, z28.s, z13.s\n"
+ "smax z27.s, p2/M, z27.s, z31.s\n"
+ "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "st1b { z27.s }, p0, [x23, x27]\n"
+ "add z29.s, z29.s, z13.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n"
+ "add z26.s, z26.s, z14.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z23.s, z23.s, z14.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z23.s, p2/M, z23.s, z12.s\n"
+ "smax z26.s, p2/M, z26.s, z31.s\n"
+ "st1b { z26.s }, p0, [x22, x27]\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "smax z24.s, p2/M, z24.s, z31.s\n"
+ "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+ "smax z23.s, p2/M, z23.s, z31.s\n"
+ "st1b { z25.s }, p0, [x21, x27]\n"
+ "add z26.s, z26.s, z13.s\n"
+ "st1b { z24.s }, p0, [x20, x27]\n"
+ "st1b { z23.s }, p0, [x19, x27]\n"
+ "incw x27\n"
+ "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+ "add z25.s, z25.s, z13.s\n"
+ "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z23.s, z23.s, z13.s\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #8\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
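+
+// This packed variant streams everything it consumes per iteration (the
+// requantisation multiplier and shift in z22/z21, the next bias in z13, and
+// the next block of packed weights) from the single %x[params] buffer,
+// advancing it with vector-length-agnostic addvl increments rather than
+// fixed byte offsets.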
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..361f48bfbe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+ sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
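+
+// The u8s8u8q name encodes the mixed signedness above: uint8_t activations in,
+// int8_t weights, uint8_t output after requantisation. The implementation
+// accordingly widens weights with ld1sb/ssublb and activations with
+// ld1b/usublb.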
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4fc8999ea1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x16, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z11.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z12.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z14.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x16, x8\n"
+ "ld1rw { z17.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x16, x8\n"
+ "ldp x11, x10, [x21, #0x0]\n"
+ "mov x19, x16\n"
+ "incw x19\n"
+ "ldp x9, x28, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x8\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z10.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z10.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z15.s, z10.s, z16.s\n"
+ "mov z25.d, z13.d\n"
+ "ld1sb { z0.h }, p4/Z, [x17]\n"
+ "mov z23.d, z13.d\n"
+ "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "mov z9.d, z15.d\n"
+ "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "mov z22.d, z15.d\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
+ "mov z10.d, z13.d\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "mov z24.d, z15.d\n"
+ "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x455a1000 // ssublb z0.h, z0.b, z26.b\n"
+ "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ ".inst 0x455a1021 // ssublb z1.h, z1.b, z26.b\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x455a1042 // ssublb z2.h, z2.b, z26.b\n"
+ "ld1sb { z8.h }, p4/Z, [x17]\n"
+ ".inst 0x455a1063 // ssublb z3.h, z3.b, z26.b\n"
+ "ldp x23, x22, [x13, #0x0]\n"
+ ".inst 0x455a1084 // ssublb z4.h, z4.b, z26.b\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ ".inst 0x455a10a5 // ssublb z5.h, z5.b, z26.b\n"
+ ".inst 0x455a10c6 // ssublb z6.h, z6.b, z26.b\n"
+ "ldr x19, [x13, #0x20]\n"
+ ".inst 0x455a10e7 // ssublb z7.h, z7.b, z26.b\n"
+ ".inst 0x455a1108 // ssublb z8.h, z8.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x16]\n"
+ "ld1b { z30.h }, p3/Z, [x22, x16]\n"
+ ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
+ "ld1b { z29.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ "ld1b { z27.h }, p3/Z, [x19, x16]\n"
+ ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
+ ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
+ ".inst 0x454b1b7b // usublb z27.h, z27.b, z11.b\n"
+ "1:" // Loop
+ ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n"
+ "ldr x20, [x13, #0x28]\n"
+ "whilelt p0.h, x15, x8\n"
+ ".inst 0x448447ef // smlalt z15.s, p4/M, z31.h, z4.h\n"
+ "ldr x27, [x13, #0x30]\n"
+ "inch x17\n"
+ ".inst 0x448343f9 // smlalb z25.s, p4/M, z31.h, z3.h\n"
+ "ldr x26, [x13, #0x38]\n"
+ ".inst 0x448347e9 // smlalt z9.s, p4/M, z31.h, z3.h\n"
+ "ldr x25, [x13, #0x40]\n"
+ ".inst 0x448143f7 // smlalb z23.s, p4/M, z31.h, z1.h\n"
+ "ldr x19, [x13, #0x48]\n"
+ ".inst 0x448147f6 // smlalt z22.s, p4/M, z31.h, z1.h\n"
+ "ldr x24, [x13, #0x50]\n"
+ ".inst 0x448043ea // smlalb z10.s, p4/M, z31.h, z0.h\n"
+ "ldr x23, [x13, #0x58]\n"
+ ".inst 0x448047f8 // smlalt z24.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
+ "ldr x22, [x13, #0x60]\n"
+ ".inst 0x448047cf // smlalt z15.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z30.h }, p3/Z, [x19, x16]\n"
+ ".inst 0x448243b9 // smlalb z25.s, p4/M, z29.h, z2.h\n"
+ "ldr x21, [x13, #0x68]\n"
+ ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
+ "ldr x20, [x13, #0x70]\n"
+ ".inst 0x448247a9 // smlalt z9.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x27, x16]\n"
+ ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ ".inst 0x4485478f // smlalt z15.s, p4/M, z28.h, z5.h\n"
+ "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
+ ".inst 0x44844399 // smlalb z25.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n"
+ "uzp1 z21.s, z19.s, z16.s\n"
+ "uzp2 z18.s, z19.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ ".inst 0x44824397 // smlalb z23.s, p4/M, z28.h, z2.h\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x44824796 // smlalt z22.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x4481438a // smlalb z10.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814798 // smlalt z24.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x26, x16]\n"
+ "uzp1 z20.s, z19.s, z16.s\n"
+ "uzp2 z19.s, z19.s, z16.s\n"
+ ".inst 0x448643f7 // smlalb z23.s, p4/M, z31.h, z6.h\n"
+ ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
+ ".inst 0x448647f6 // smlalt z22.s, p4/M, z31.h, z6.h\n"
+ "ld1b { z31.h }, p3/Z, [x25, x16]\n"
+ ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x4487476f // smlalt z15.s, p4/M, z27.h, z7.h\n"
+ ".inst 0x44864379 // smlalb z25.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
+ ".inst 0x44864769 // smlalt z9.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44844377 // smlalb z23.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x44844776 // smlalt z22.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4483436a // smlalb z10.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x4481478f // smlalt z15.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x448843aa // smlalb z10.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x448847b8 // smlalt z24.s, p4/M, z29.h, z8.h\n"
+ "ld1b { z29.h }, p3/Z, [x24, x16]\n"
+ ".inst 0x44804399 // smlalb z25.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n"
+ "ld1b { z28.h }, p3/Z, [x23, x16]\n"
+ ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
+ ".inst 0x448247ef // smlalt z15.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
+ ".inst 0x448143f9 // smlalb z25.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n"
+ "ld1b { z31.h }, p3/Z, [x22, x16]\n"
+ ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448847cf // smlalt z15.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448743d9 // smlalb z25.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
+ ".inst 0x448747c9 // smlalt z9.s, p4/M, z30.h, z7.h\n"
+ ".inst 0x448543d7 // smlalb z23.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448547d6 // smlalt z22.s, p4/M, z30.h, z5.h\n"
+ ".inst 0x448443ca // smlalb z10.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447d8 // smlalt z24.s, p4/M, z30.h, z4.h\n"
+ "ld1b { z30.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448343ad // smlalb z13.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448347af // smlalt z15.s, p4/M, z29.h, z3.h\n"
+ ".inst 0x448043b7 // smlalb z23.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
+ ".inst 0x448047b6 // smlalt z22.s, p4/M, z29.h, z0.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44854399 // smlalb z25.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x4482438a // smlalb z10.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
+ ".inst 0x44824798 // smlalt z24.s, p4/M, z28.h, z2.h\n"
+ "ld1b { z28.h }, p3/Z, [x19, x16]\n"
+ "inch x16\n"
+ ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n"
+ "whilelt p2.s, x16, x8\n"
+ ".inst 0x448647ef // smlalt z15.s, p4/M, z31.h, z6.h\n"
+ "mov x19, x16\n"
+ ".inst 0x448343f7 // smlalb z23.s, p4/M, z31.h, z3.h\n"
+ "incw x19\n"
+ ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
+ "whilelt p1.s, x19, x8\n"
+ ".inst 0x448347f6 // smlalt z22.s, p4/M, z31.h, z3.h\n"
+ "whilelt p3.h, x16, x8\n"
+ ".inst 0x04b575ad // sqrdmulh z13.s, z13.s, z21.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x448843d9 // smlalb z25.s, p4/M, z30.h, z8.h\n"
+ ".inst 0x448847c9 // smlalt z9.s, p4/M, z30.h, z8.h\n"
+ "and z4.d, z13.d, z20.d\n"
+ "and z16.d, z15.d, z19.d\n"
+ ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ "sqadd z13.s, z13.s, z4.s\n"
+ "sqadd z15.s, z15.s, z16.s\n"
+ "and z2.d, z25.d, z20.d\n"
+ "and z16.d, z9.d, z19.d\n"
+ ".inst 0x448543ca // smlalb z10.s, p4/M, z30.h, z5.h\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448547d8 // smlalt z24.s, p4/M, z30.h, z5.h\n"
+ "sqadd z25.s, z25.s, z2.s\n"
+ "sqadd z9.s, z9.s, z16.s\n"
+ ".inst 0x448743b7 // smlalb z23.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448747b6 // smlalt z22.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x448643aa // smlalb z10.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x448647b8 // smlalt z24.s, p4/M, z29.h, z6.h\n"
+ ".inst 0x44884397 // smlalb z23.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44884796 // smlalt z22.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x4487438a // smlalb z10.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x44874798 // smlalt z24.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x04b576f7 // sqrdmulh z23.s, z23.s, z21.s\n"
+ ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
+ ".inst 0x04b5754a // sqrdmulh z10.s, z10.s, z21.s\n"
+ ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
+ "and z18.d, z23.d, z20.d\n"
+ "and z0.d, z22.d, z19.d\n"
+ "and z16.d, z10.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "sqadd z22.s, z22.s, z0.s\n"
+ "sqadd z10.s, z10.s, z16.s\n"
+ "and z16.d, z24.d, z19.d\n"
+ ".inst 0x4482928d // srshl z13.s, p4/M, z13.s, z20.s\n"
+ ".inst 0x4482926f // srshl z15.s, p4/M, z15.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829299 // srshl z25.s, p4/M, z25.s, z20.s\n"
+ "add z13.s, z13.s, z12.s\n"
+ "add z15.s, z15.s, z12.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "smin z13.s, p4/M, z13.s, z17.s\n"
+ "smin z15.s, p4/M, z15.s, z17.s\n"
+ "smin z25.s, p4/M, z25.s, z17.s\n"
+ ".inst 0x44829269 // srshl z9.s, p4/M, z9.s, z19.s\n"
+ "smax z13.s, p4/M, z13.s, z14.s\n"
+ "smax z15.s, p4/M, z15.s, z14.s\n"
+ "smax z25.s, p4/M, z25.s, z14.s\n"
+ "add z9.s, z9.s, z12.s\n"
+ ".inst 0x44829297 // srshl z23.s, p4/M, z23.s, z20.s\n"
+ "trn1 z13.h, z13.h, z15.h\n"
+ "st1b { z13.h }, p0, [x11, x15]\n"
+ "smin z9.s, p4/M, z9.s, z17.s\n"
+ ".inst 0x44829276 // srshl z22.s, p4/M, z22.s, z19.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ ".inst 0x4482928a // srshl z10.s, p4/M, z10.s, z20.s\n"
+ ".inst 0x44829278 // srshl z24.s, p4/M, z24.s, z19.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "smax z9.s, p4/M, z9.s, z14.s\n"
+ "add z10.s, z10.s, z12.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "smin z23.s, p4/M, z23.s, z17.s\n"
+ "trn1 z25.h, z25.h, z9.h\n"
+ "st1b { z25.h }, p0, [x10, x15]\n"
+ "smin z22.s, p4/M, z22.s, z17.s\n"
+ "smin z10.s, p4/M, z10.s, z17.s\n"
+ "smax z23.s, p4/M, z23.s, z14.s\n"
+ "smin z24.s, p4/M, z24.s, z17.s\n"
+ "smax z22.s, p4/M, z22.s, z14.s\n"
+ "smax z10.s, p4/M, z10.s, z14.s\n"
+ "smax z24.s, p4/M, z24.s, z14.s\n"
+ "trn1 z23.h, z23.h, z22.h\n"
+ "st1b { z23.h }, p0, [x9, x15]\n"
+ "trn1 z10.h, z10.h, z24.h\n"
+ "st1b { z10.h }, p0, [x28, x15]\n"
+ "inch x15\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z10.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z10.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z15.s, z10.s, z16.s\n"
+ "mov z25.d, z13.d\n"
+ "ld1sb { z0.h }, p4/Z, [x17]\n"
+ "mov z23.d, z13.d\n"
+ "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "mov z9.d, z15.d\n"
+ "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "mov z22.d, z15.d\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
+ "mov z10.d, z13.d\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "mov z24.d, z15.d\n"
+ "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x455a1000 // ssublb z0.h, z0.b, z26.b\n"
+ "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+ ".inst 0x455a1021 // ssublb z1.h, z1.b, z26.b\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x455a1042 // ssublb z2.h, z2.b, z26.b\n"
+ "ld1sb { z8.h }, p4/Z, [x17]\n"
+ ".inst 0x455a1063 // ssublb z3.h, z3.b, z26.b\n"
+ "ldp x23, x22, [x13, #0x0]\n"
+ ".inst 0x455a1084 // ssublb z4.h, z4.b, z26.b\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ ".inst 0x455a10a5 // ssublb z5.h, z5.b, z26.b\n"
+ ".inst 0x455a10c6 // ssublb z6.h, z6.b, z26.b\n"
+ "ldr x19, [x13, #0x20]\n"
+ ".inst 0x455a10e7 // ssublb z7.h, z7.b, z26.b\n"
+ ".inst 0x455a1108 // ssublb z8.h, z8.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x16]\n"
+ "ld1b { z30.h }, p3/Z, [x22, x16]\n"
+ ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n"
+ "ld1b { z29.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ "ld1b { z27.h }, p3/Z, [x19, x16]\n"
+ ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n"
+ ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n"
+ ".inst 0x454b1b7b // usublb z27.h, z27.b, z11.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
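
(Note, not part of the patch: the output stage of the loop above is the standard fixed-point requantisation. Each 32-bit accumulator is scaled against the per-channel multiplier with sqrdmulh, rounding-shifted by the per-channel shift with srshl, offset by c_offset, clamped to [minval, maxval] and narrowed to uint8 via trn1/st1b. A scalar model of the per-lane arithmetic follows; it is illustrative only, folds the asm's sign fix-up (the and/asr/sqadd triple) into the rounding shift, and ignores the sqrdmulh saturation corner case at INT32_MIN.)

    #include <algorithm>
    #include <cstdint>

    static uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                   const arm_gemm::Requantize32 &qp)
    {
      // sqrdmulh: doubling, rounding, high-half multiply.
      const int64_t prod = (int64_t)acc * (int64_t)mul;
      int32_t v = (int32_t)((prod + (1LL << 30)) >> 31);
      // srshl with a negative shift amount is a rounding shift right.
      if (shift < 0)
      {
        const int s = -shift;
        v = (int32_t)(((int64_t)v + (1LL << (s - 1))) >> s);
      }
      v += qp.c_offset;            // add z12.s
      v = std::min(v, qp.maxval);  // smin against z17.s
      v = std::max(v, qp.minval);  // smax against z14.s
      return (uint8_t)v;
    }
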
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dc33a3fe3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+ kern_type kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+ sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
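
(Note, not part of the patch: the tile geometry in these headers is not independent. For a dense depthwise window, input extent = (output - 1) * stride + kernel, which for the 3x3/stride-2/2x2 struct above gives the declared 5x5 input tile and hence the 25-entry pointer array in the generic.cpp that follows. As a sketch:)

    // Illustrative check of the constants declared above.
    static_assert((2u - 1u) * 2u + 3u == 5u,
                  "input_rows = (output_rows - 1) * stride_rows + kernel_rows");
    static_assert(5u * 5u == 25u, "matches Params::inptrs[25] in generic.cpp");
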
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..63960f08e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x6, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x7, #0x0\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z16.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z14.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x6, x4\n"
+ "ld1rw { z15.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x6, x4\n"
+ "ldp x15, x14, [x21, #0x0]\n"
+ "mov x19, x6\n"
+ "incw x19\n"
+ "ldp x13, x12, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x4\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z20.s }, p2/Z, [x19]\n"
+ "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z20.s, z10.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z20.s, z20.s, z10.s\n"
+ "mov z11.d, z13.d\n"
+ "ld1sb { z0.h }, p4/Z, [x5]\n"
+ "mov z9.d, z13.d\n"
+ "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n"
+ "mov z18.d, z20.d\n"
+ "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n"
+ "mov z19.d, z20.d\n"
+ "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n"
+ "mov z23.d, z13.d\n"
+ "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n"
+ "mov z21.d, z20.d\n"
+ "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n"
+ ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
+ "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n"
+ "inch x5, ALL, MUL #8\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "ld1sb { z8.h }, p4/Z, [x5]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldp x26, x25, [x17, #0x0]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ "ldp x24, x23, [x17, #0x10]\n"
+ ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ "ldp x22, x21, [x17, #0x20]\n"
+ ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
+ ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
+ "ldp x20, x19, [x17, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x26, x6]\n"
+ ".inst 0x45501bff // usublb z31.h, z31.b, z16.b\n"
+ "ld1b { z30.h }, p3/Z, [x25, x6]\n"
+ "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+ ".inst 0x45501bde // usublb z30.h, z30.b, z16.b\n"
+ "ld1b { z28.h }, p3/Z, [x23, x6]\n"
+ ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
+ "ld1b { z27.h }, p3/Z, [x22, x6]\n"
+ "ld1b { z26.h }, p3/Z, [x21, x6]\n"
+ ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
+ "ld1b { z25.h }, p3/Z, [x20, x6]\n"
+ "ld1b { z24.h }, p3/Z, [x19, x6]\n"
+ ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
+ ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
+ ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
+ ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
+ "1:" // Loop
+ ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "whilelt p0.h, x7, x4\n"
+ ".inst 0x448847f4 // smlalt z20.s, p4/M, z31.h, z8.h\n"
+ "ldr x21, [x17, #0x48]\n"
+ "inch x5\n"
+ ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ ".inst 0x448647f2 // smlalt z18.s, p4/M, z31.h, z6.h\n"
+ "ldr x19, [x17, #0x58]\n"
+ ".inst 0x448243e9 // smlalb z9.s, p4/M, z31.h, z2.h\n"
+ "ldr x11, [x17, #0x60]\n"
+ ".inst 0x448247f3 // smlalt z19.s, p4/M, z31.h, z2.h\n"
+ "ldr x10, [x17, #0x68]\n"
+ ".inst 0x448043f7 // smlalb z23.s, p4/M, z31.h, z0.h\n"
+ "ldr x9, [x17, #0x70]\n"
+ ".inst 0x448047f5 // smlalt z21.s, p4/M, z31.h, z0.h\n"
+ "ldr x28, [x17, #0x78]\n"
+ ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n"
+ "ldr x27, [x17, #0x80]\n"
+ ".inst 0x448047d4 // smlalt z20.s, p4/M, z30.h, z0.h\n"
+ "ldr x26, [x17, #0x88]\n"
+ ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n"
+ "ldr x25, [x17, #0x90]\n"
+ ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x21, x6]\n"
+ ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n"
+ "ldr x24, [x17, #0x98]\n"
+ ".inst 0x448147b4 // smlalt z20.s, p4/M, z29.h, z1.h\n"
+ "ld1b { z29.h }, p3/Z, [x22, x6]\n"
+ ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n"
+ "ldr x23, [x17, #0xa0]\n"
+ ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
+ "ldr x22, [x17, #0xa8]\n"
+ ".inst 0x44824772 // smlalt z18.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x6]\n"
+ ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n"
+ "ldr x20, [x17, #0xb8]\n"
+ ".inst 0x44834754 // smlalt z20.s, p4/M, z26.h, z3.h\n"
+ "ld1b { z26.h }, p3/Z, [x19, x6]\n"
+ ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
+ "ldr x19, [x17, #0xc0]\n"
+ ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n"
+ "ld1w { z10.s }, p2/Z, [x8]\n"
+ ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n"
+ "ld1w { z22.s }, p1/Z, [x8, #1, MUL VL]\n"
+ "addvl x8, x8, #2\n"
+ ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
+ ".inst 0x44844734 // smlalt z20.s, p4/M, z25.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x11, x6]\n"
+ ".inst 0x44804712 // smlalt z18.s, p4/M, z24.h, z0.h\n"
+ "uzp1 z31.s, z10.s, z22.s\n"
+ "uzp2 z30.s, z10.s, z22.s\n"
+ "ld1w { z10.s }, p2/Z, [x16]\n"
+ ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
+ "ld1w { z22.s }, p1/Z, [x16, #1, MUL VL]\n"
+ "addvl x16, x16, #2\n"
+ ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824714 // smlalt z20.s, p4/M, z24.h, z2.h\n"
+ "ld1b { z24.h }, p3/Z, [x9, x6]\n"
+ ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x448447b2 // smlalt z18.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z29.h }, p3/Z, [x10, x6]\n"
+ ".inst 0x44834349 // smlalb z9.s, p4/M, z26.h, z3.h\n"
+ ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
+ ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
+ ".inst 0x44854792 // smlalt z18.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z28.h }, p3/Z, [x27, x6]\n"
+ ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x44854774 // smlalt z20.s, p4/M, z27.h, z5.h\n"
+ ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
+ ".inst 0x44834772 // smlalt z18.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z27.h }, p3/Z, [x28, x6]\n"
+ ".inst 0x44834753 // smlalt z19.s, p4/M, z26.h, z3.h\n"
+ "ld1b { z26.h }, p3/Z, [x26, x6]\n"
+ ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44864734 // smlalt z20.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
+ ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
+ ".inst 0x44804329 // smlalb z9.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804733 // smlalt z19.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x25, x6]\n"
+ "uzp1 z0.s, z10.s, z22.s\n"
+ "uzp2 z22.s, z10.s, z22.s\n"
+ ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n"
+ ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
+ ".inst 0x448447b3 // smlalt z19.s, p4/M, z29.h, z4.h\n"
+ "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+ ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x44874714 // smlalt z20.s, p4/M, z24.h, z7.h\n"
+ ".inst 0x44814309 // smlalb z9.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
+ ".inst 0x04bf75ad // sqrdmulh z13.s, z13.s, z31.s\n"
+ ".inst 0x04be7694 // sqrdmulh z20.s, z20.s, z30.s\n"
+ ".inst 0x44814713 // smlalt z19.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x22, x6]\n"
+ ".inst 0x44844377 // smlalb z23.s, p4/M, z27.h, z4.h\n"
+ "and z10.d, z13.d, z0.d\n"
+ ".inst 0x44844775 // smlalt z21.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x6]\n"
+ ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "and z4.d, z20.d, z22.d\n"
+ ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
+ "sqadd z13.s, z13.s, z10.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n"
+ ".inst 0x44874792 // smlalt z18.s, p4/M, z28.h, z7.h\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ ".inst 0x44814397 // smlalb z23.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814795 // smlalt z21.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44864329 // smlalb z9.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44864733 // smlalt z19.s, p4/M, z25.h, z6.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x6]\n"
+ ".inst 0x44854357 // smlalb z23.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854755 // smlalt z21.s, p4/M, z26.h, z5.h\n"
+ "ld1b { z26.h }, p3/Z, [x21, x6]\n"
+ ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
+ ".inst 0x04bf756b // sqrdmulh z11.s, z11.s, z31.s\n"
+ ".inst 0x448243b7 // smlalb z23.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n"
+ ".inst 0x448247b5 // smlalt z21.s, p4/M, z29.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x19, x6]\n"
+ "inch x6\n"
+ "and z2.d, z11.d, z0.d\n"
+ "whilelt p2.s, x6, x4\n"
+ ".inst 0x44874369 // smlalb z9.s, p4/M, z27.h, z7.h\n"
+ "mov x19, x6\n"
+ "and z10.d, z18.d, z22.d\n"
+ "incw x19\n"
+ ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
+ "whilelt p1.s, x19, x4\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "whilelt p3.h, x6, x4\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ ".inst 0x44874773 // smlalt z19.s, p4/M, z27.h, z7.h\n"
+ "sqadd z11.s, z11.s, z2.s\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ ".inst 0x44854309 // smlalb z9.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44854713 // smlalt z19.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x44834317 // smlalb z23.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834715 // smlalt z21.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44884329 // smlalb z9.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x44884733 // smlalt z19.s, p4/M, z25.h, z8.h\n"
+ ".inst 0x44874357 // smlalb z23.s, p4/M, z26.h, z7.h\n"
+ ".inst 0x44874755 // smlalt z21.s, p4/M, z26.h, z7.h\n"
+ ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n"
+ ".inst 0x04be7673 // sqrdmulh z19.s, z19.s, z30.s\n"
+ ".inst 0x44864337 // smlalb z23.s, p4/M, z25.h, z6.h\n"
+ ".inst 0x44864735 // smlalt z21.s, p4/M, z25.h, z6.h\n"
+ "and z10.d, z9.d, z0.d\n"
+ "and z24.d, z19.d, z22.d\n"
+ ".inst 0x448843b7 // smlalb z23.s, p4/M, z29.h, z8.h\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ ".inst 0x448847b5 // smlalt z21.s, p4/M, z29.h, z8.h\n"
+ "sqadd z9.s, z9.s, z10.s\n"
+ "sqadd z19.s, z19.s, z24.s\n"
+ ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n"
+ ".inst 0x04be76b5 // sqrdmulh z21.s, z21.s, z30.s\n"
+ ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "and z30.d, z23.d, z0.d\n"
+ "and z28.d, z21.d, z22.d\n"
+ "add z13.s, z13.s, z14.s\n"
+ "add z20.s, z20.s, z14.s\n"
+ "asr z30.s, z30.s, #0x1f\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "smin z13.s, p4/M, z13.s, z15.s\n"
+ "sqadd z23.s, z23.s, z30.s\n"
+ "sqadd z21.s, z21.s, z28.s\n"
+ "smin z20.s, p4/M, z20.s, z15.s\n"
+ "smax z13.s, p4/M, z13.s, z17.s\n"
+ ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ "smax z20.s, p4/M, z20.s, z17.s\n"
+ ".inst 0x44829009 // srshl z9.s, p4/M, z9.s, z0.s\n"
+ "add z11.s, z11.s, z14.s\n"
+ "add z18.s, z18.s, z14.s\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "st1b { z13.h }, p0, [x15, x7]\n"
+ "add z9.s, z9.s, z14.s\n"
+ "smin z11.s, p4/M, z11.s, z15.s\n"
+ "smin z18.s, p4/M, z18.s, z15.s\n"
+ ".inst 0x448292d3 // srshl z19.s, p4/M, z19.s, z22.s\n"
+ "smin z9.s, p4/M, z9.s, z15.s\n"
+ "smax z11.s, p4/M, z11.s, z17.s\n"
+ "smax z18.s, p4/M, z18.s, z17.s\n"
+ "add z19.s, z19.s, z14.s\n"
+ "smax z9.s, p4/M, z9.s, z17.s\n"
+ ".inst 0x44829017 // srshl z23.s, p4/M, z23.s, z0.s\n"
+ "trn1 z11.h, z11.h, z18.h\n"
+ "st1b { z11.h }, p0, [x14, x7]\n"
+ "smin z19.s, p4/M, z19.s, z15.s\n"
+ ".inst 0x448292d5 // srshl z21.s, p4/M, z21.s, z22.s\n"
+ "add z23.s, z23.s, z14.s\n"
+ "add z21.s, z21.s, z14.s\n"
+ "smax z19.s, p4/M, z19.s, z17.s\n"
+ "smin z23.s, p4/M, z23.s, z15.s\n"
+ "smin z21.s, p4/M, z21.s, z15.s\n"
+ "trn1 z9.h, z9.h, z19.h\n"
+ "st1b { z9.h }, p0, [x13, x7]\n"
+ "smax z23.s, p4/M, z23.s, z17.s\n"
+ "smax z21.s, p4/M, z21.s, z17.s\n"
+ "trn1 z23.h, z23.h, z21.h\n"
+ "st1b { z23.h }, p0, [x12, x7]\n"
+ "inch x7\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z20.s }, p2/Z, [x19]\n"
+ "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z13.s, z20.s, z10.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z20.s, z20.s, z10.s\n"
+ "mov z11.d, z13.d\n"
+ "ld1sb { z0.h }, p4/Z, [x5]\n"
+ "mov z9.d, z13.d\n"
+ "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n"
+ "mov z18.d, z20.d\n"
+ "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n"
+ "mov z19.d, z20.d\n"
+ "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n"
+ "mov z23.d, z13.d\n"
+ "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n"
+ "mov z21.d, z20.d\n"
+ "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n"
+ ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n"
+ "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n"
+ "inch x5, ALL, MUL #8\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "ld1sb { z8.h }, p4/Z, [x5]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldp x26, x25, [x17, #0x0]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ "ldp x24, x23, [x17, #0x10]\n"
+ ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ "ldp x22, x21, [x17, #0x20]\n"
+ ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n"
+ ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n"
+ "ldp x20, x19, [x17, #0x30]\n"
+ "ld1b { z31.h }, p3/Z, [x26, x6]\n"
+ ".inst 0x45501bff // usublb z31.h, z31.b, z16.b\n"
+ "ld1b { z30.h }, p3/Z, [x25, x6]\n"
+ "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+ ".inst 0x45501bde // usublb z30.h, z30.b, z16.b\n"
+ "ld1b { z28.h }, p3/Z, [x23, x6]\n"
+ ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n"
+ "ld1b { z27.h }, p3/Z, [x22, x6]\n"
+ "ld1b { z26.h }, p3/Z, [x21, x6]\n"
+ ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n"
+ "ld1b { z25.h }, p3/Z, [x20, x6]\n"
+ "ld1b { z24.h }, p3/Z, [x19, x6]\n"
+ ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n"
+ ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n"
+ ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n"
+ ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
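
(Note, not part of the patch: ahead of the output stage, the loop body above is a plain offset-corrected multiply-accumulate. usublb widens each uint8 input while subtracting the activation offset, ssublb widens each int8 weight while subtracting the weight offset, and smlalb/smlalt accumulate the products into 32-bit lanes seeded from the bias. A scalar model for one output pixel and channel follows; input() and weight() are hypothetical accessors standing in for the kernel's pointer arithmetic.)

    // Illustrative scalar model of the accumulation for this stride-2 kernel.
    int32_t acc = bias[c];
    for (unsigned ky = 0; ky < 3; ky++)
    {
      for (unsigned kx = 0; kx < 3; kx++)
      {
        const int32_t in = (int32_t)input(oy * 2 + ky, ox * 2 + kx, c) - qp.a_offset; // usublb
        const int32_t wt = (int32_t)weight(ky, kx, c) - qp.b_offset;                  // ssublb
        acc += in * wt;                                                               // smlalb/smlalt
      }
    }
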
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..906ef36c8f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef int32_t bias_type;
+ typedef uint8_t input_type;
+ typedef int8_t weight_type;
+ typedef uint8_t return_type;
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+ typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+ typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+ typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters;
+ constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size;
+
+ kern_type kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+ sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
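
(Note, not part of the patch: all of these kernels handle the channel tail with SVE predication rather than scalar remainder code. whilelt derives predicates from the channel counter, every load and store is masked by one of them, and b.any 1b repeats the loop while any lane is still active. Roughly, in scalar terms; VL and process_tile() are hypothetical stand-ins for the vector length and the asm loop body.)

    // Illustrative model of the predicated channel loop (whilelt / b.any).
    for (unsigned c0 = 0; c0 < n_channels; c0 += VL)
    {
      const unsigned active = std::min(VL, n_channels - c0); // predicate analogue
      process_tile(c0, active);                              // masked loads, MACs, stores
    }
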
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..6c321efa29
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const int8_t *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+ "mov x2, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x3, #0x0\n"
+ "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z9.b }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z14.b }, p4/Z, [x20]\n"
+ "add x20, x22, %[offsetof_Requantize32_minval]\n"
+ "ld1rw { z17.s }, p4/Z, [x19]\n"
+ "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+ "ld1rw { z12.s }, p4/Z, [x20]\n"
+ "whilelt p3.h, x2, x0\n"
+ "ld1rw { z11.s }, p4/Z, [x19]\n"
+ "whilelt p2.s, x2, x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "mov x19, x2\n"
+ "incw x19\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "whilelt p1.s, x19, x0\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z4.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z15.s, z4.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z18.s, z4.s, z16.s\n"
+ "mov z21.d, z15.d\n"
+ "ld1sb { z0.h }, p4/Z, [x1]\n"
+ "mov z5.d, z15.d\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+ "mov z13.d, z18.d\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+ "mov z7.d, z18.d\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+ "mov z6.d, z15.d\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+ "mov z8.d, z18.d\n"
+ "ldp x28, x27, [x5, #0x0]\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ "ldp x26, x25, [x5, #0x10]\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ "ldp x24, x23, [x5, #0x20]\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ "ldp x22, x21, [x5, #0x30]\n"
+ "ldp x20, x19, [x5, #0x40]\n"
+ "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
+ "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
+ "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x45491bbd // usublb z29.h, z29.b, z9.b\n"
+ "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+ "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
+ "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
+ ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
+ ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
+ ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n"
+ "1:" // Loop
+ ".inst 0x448043ef // smlalb z15.s, p4/M, z31.h, z0.h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "whilelt p0.h, x3, x0\n"
+ ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
+ "ldr x19, [x5, #0x58]\n"
+ ".inst 0x448043d5 // smlalb z21.s, p4/M, z30.h, z0.h\n"
+ "ldr x25, [x5, #0x60]\n"
+ ".inst 0x448047cd // smlalt z13.s, p4/M, z30.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x448043a5 // smlalb z5.s, p4/M, z29.h, z0.h\n"
+ "ldr x24, [x5, #0x68]\n"
+ ".inst 0x448047a7 // smlalt z7.s, p4/M, z29.h, z0.h\n"
+ "ldr x23, [x5, #0x70]\n"
+ ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n"
+ "ldr x22, [x5, #0x78]\n"
+ ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
+ "ldr x15, [x5, #0x80]\n"
+ ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x448143cf // smlalb z15.s, p4/M, z30.h, z1.h\n"
+ "ldr x21, [x5, #0x88]\n"
+ ".inst 0x448147d2 // smlalt z18.s, p4/M, z30.h, z1.h\n"
+ "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x44814375 // smlalb z21.s, p4/M, z27.h, z1.h\n"
+ "ldr x20, [x5, #0x90]\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ "ldr x19, [x5, #0x98]\n"
+ ".inst 0x4481476d // smlalt z13.s, p4/M, z27.h, z1.h\n"
+ "ldr x14, [x5, #0xa0]\n"
+ ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
+ "ldr x13, [x5, #0xa8]\n"
+ ".inst 0x44814385 // smlalb z5.s, p4/M, z28.h, z1.h\n"
+ "ldr x12, [x5, #0xb0]\n"
+ ".inst 0x44814787 // smlalt z7.s, p4/M, z28.h, z1.h\n"
+ "ldr x11, [x5, #0xb8]\n"
+ ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n"
+ "ldr x10, [x5, #0xc0]\n"
+ ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x4482436f // smlalb z15.s, p4/M, z27.h, z2.h\n"
+ "ldr x9, [x5, #0xc8]\n"
+ ".inst 0x44824772 // smlalt z18.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x44824335 // smlalb z21.s, p4/M, z25.h, z2.h\n"
+ "ldr x28, [x5, #0xd0]\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ "ldr x27, [x5, #0xd8]\n"
+ ".inst 0x4482472d // smlalt z13.s, p4/M, z25.h, z2.h\n"
+ "ldr x26, [x5, #0xe0]\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ "ld1w { z19.s }, p2/Z, [x4]\n"
+ ".inst 0x448242e5 // smlalb z5.s, p4/M, z23.h, z2.h\n"
+ "ld1w { z16.s }, p1/Z, [x4, #1, MUL VL]\n"
+ "addvl x4, x4, #2\n"
+ ".inst 0x448246e7 // smlalt z7.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ "uzp1 z10.s, z19.s, z16.s\n"
+ "uzp2 z20.s, z19.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x6]\n"
+ ".inst 0x4483432f // smlalb z15.s, p4/M, z25.h, z3.h\n"
+ "ld1w { z16.s }, p1/Z, [x6, #1, MUL VL]\n"
+ "addvl x6, x6, #2\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ ".inst 0x44834732 // smlalt z18.s, p4/M, z25.h, z3.h\n"
+ "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
+ "ldr x25, [x5, #0xe8]\n"
+ ".inst 0x4483470d // smlalt z13.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x448343e5 // smlalb z5.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
+ ".inst 0x448347e7 // smlalt z7.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1]\n"
+ ".inst 0x4484430f // smlalb z15.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844712 // smlalt z18.s, p4/M, z24.h, z4.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n"
+ "ldr x24, [x5, #0xf0]\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x4484476d // smlalt z13.s, p4/M, z27.h, z4.h\n"
+ "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
+ "ldr x23, [x5, #0xf8]\n"
+ ".inst 0x448443c5 // smlalb z5.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447c7 // smlalt z7.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
+ ".inst 0x448043af // smlalb z15.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x448047b2 // smlalt z18.s, p4/M, z29.h, z0.h\n"
+ "uzp1 z29.s, z19.s, z16.s\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ "uzp2 z19.s, z19.s, z16.s\n"
+ ".inst 0x44804395 // smlalb z21.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x4480478d // smlalt z13.s, p4/M, z28.h, z0.h\n"
+ ".inst 0x448042c5 // smlalb z5.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448046c7 // smlalt z7.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x4481438f // smlalb z15.s, p4/M, z28.h, z1.h\n"
+ ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n"
+ "ld1b { z28.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x448142f5 // smlalb z21.s, p4/M, z23.h, z1.h\n"
+ "ldr x22, [x5, #0x100]\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ ".inst 0x448146ed // smlalt z13.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
+ ".inst 0x44814325 // smlalb z5.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814727 // smlalt z7.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
+ ".inst 0x448242ef // smlalb z15.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448246f2 // smlalt z18.s, p4/M, z23.h, z2.h\n"
+ "ld1b { z23.h }, p3/Z, [x15, x2]\n"
+ ".inst 0x448243f5 // smlalb z21.s, p4/M, z31.h, z2.h\n"
+ "ldr x21, [x5, #0x108]\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ ".inst 0x448247ed // smlalt z13.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
+ ".inst 0x44824305 // smlalb z5.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824707 // smlalt z7.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x448343ef // smlalb z15.s, p4/M, z31.h, z3.h\n"
+ ".inst 0x448347f2 // smlalt z18.s, p4/M, z31.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x448343d5 // smlalb z21.s, p4/M, z30.h, z3.h\n"
+ "ldr x20, [x5, #0x110]\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ ".inst 0x448347cd // smlalt z13.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
+ ".inst 0x44834365 // smlalb z5.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834767 // smlalt z7.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x448443cf // smlalb z15.s, p4/M, z30.h, z4.h\n"
+ ".inst 0x448447d2 // smlalt z18.s, p4/M, z30.h, z4.h\n"
+ "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x44844355 // smlalb z21.s, p4/M, z26.h, z4.h\n"
+ "ldr x19, [x5, #0x118]\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x4484474d // smlalt z13.s, p4/M, z26.h, z4.h\n"
+ "ld1b { z26.h }, p3/Z, [x14, x2]\n"
+ ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
+ ".inst 0x448442e5 // smlalb z5.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448446e7 // smlalt z7.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
+ ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x448042cf // smlalb z15.s, p4/M, z22.h, z0.h\n"
+ ".inst 0x448046d2 // smlalt z18.s, p4/M, z22.h, z0.h\n"
+ "ld1b { z22.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x44804335 // smlalb z21.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ ".inst 0x4480472d // smlalt z13.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n"
+ ".inst 0x448043e5 // smlalb z5.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047e7 // smlalt z7.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4481432f // smlalb z15.s, p4/M, z25.h, z1.h\n"
+ ".inst 0x44814732 // smlalt z18.s, p4/M, z25.h, z1.h\n"
+ "ld1b { z25.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x44814315 // smlalb z21.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ ".inst 0x4481470d // smlalt z13.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
+ ".inst 0x448143c5 // smlalb z5.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448147c7 // smlalt z7.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1]\n"
+ ".inst 0x4482430f // smlalb z15.s, p4/M, z24.h, z2.h\n"
+ ".inst 0x44824712 // smlalt z18.s, p4/M, z24.h, z2.h\n"
+ "ld1b { z24.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x44824375 // smlalb z21.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ ".inst 0x4482476d // smlalt z13.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
+ ".inst 0x44824345 // smlalb z5.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824747 // smlalt z7.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
+ ".inst 0x4483436f // smlalb z15.s, p4/M, z27.h, z3.h\n"
+ ".inst 0x44834772 // smlalt z18.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z27.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x448342f5 // smlalb z21.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ ".inst 0x448346ed // smlalt z13.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ ".inst 0x44834325 // smlalb z5.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834727 // smlalt z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
+ ".inst 0x448442ef // smlalb z15.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x448446f2 // smlalt z18.s, p4/M, z23.h, z4.h\n"
+ "ld1b { z23.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x44844395 // smlalb z21.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x4484478d // smlalt z13.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z28.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
+ ".inst 0x44844305 // smlalb z5.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844707 // smlalt z7.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
+ ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
+ ".inst 0x448043ef // smlalb z15.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n"
+ "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x448043d5 // smlalb z21.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ ".inst 0x448047cd // smlalt z13.s, p4/M, z30.h, z0.h\n"
+ ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
+ ".inst 0x44804365 // smlalb z5.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44804767 // smlalt z7.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n"
+ "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
+ ".inst 0x448143cf // smlalb z15.s, p4/M, z30.h, z1.h\n"
+ ".inst 0x448147d2 // smlalt z18.s, p4/M, z30.h, z1.h\n"
+ "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x44814355 // smlalb z21.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ ".inst 0x4481474d // smlalt z13.s, p4/M, z26.h, z1.h\n"
+ ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
+ ".inst 0x448142e5 // smlalb z5.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
+ ".inst 0x4482434f // smlalb z15.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z26.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x44824335 // smlalb z21.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ ".inst 0x4482472d // smlalt z13.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
+ ".inst 0x448243e5 // smlalb z5.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
+ ".inst 0x4483432f // smlalb z15.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834732 // smlalt z18.s, p4/M, z25.h, z3.h\n"
+ "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ ".inst 0x4483470d // smlalt z13.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
+ ".inst 0x448343c5 // smlalb z5.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n"
+ ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
+ "inch x1, ALL, MUL #8\n"
+ ".inst 0x4484430f // smlalb z15.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844712 // smlalt z18.s, p4/M, z24.h, z4.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x448442d5 // smlalb z21.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x448446cd // smlalt z13.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
+ ".inst 0x44844385 // smlalb z5.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ "ld1sb { z4.h }, p4/Z, [x1]\n"
+ "inch x1\n"
+ ".inst 0x4480436f // smlalb z15.s, p4/M, z27.h, z0.h\n"
+ ".inst 0x44804772 // smlalt z18.s, p4/M, z27.h, z0.h\n"
+ "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x448042f5 // smlalb z21.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ ".inst 0x448046ed // smlalt z13.s, p4/M, z23.h, z0.h\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ ".inst 0x44804325 // smlalb z5.s, p4/M, z25.h, z0.h\n"
+ ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x448142ef // smlalb z15.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
+ ".inst 0x448146f2 // smlalt z18.s, p4/M, z23.h, z1.h\n"
+ ".inst 0x448143f5 // smlalb z21.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x448147ed // smlalt z13.s, p4/M, z31.h, z1.h\n"
+ ".inst 0x44814305 // smlalb z5.s, p4/M, z24.h, z1.h\n"
+ ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n"
+ "ld1b { z24.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n"
+ ".inst 0x448243ef // smlalb z15.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
+ ".inst 0x448247f2 // smlalt z18.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448243d5 // smlalb z21.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x448247cd // smlalt z13.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824365 // smlalb z5.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z27.h }, p3/Z, [x19, x2]\n"
+ "inch x2\n"
+ ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n"
+ "whilelt p2.s, x2, x0\n"
+ ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n"
+ "mov x19, x2\n"
+ ".inst 0x448343cf // smlalb z15.s, p4/M, z30.h, z3.h\n"
+ "incw x19\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ "whilelt p1.s, x19, x0\n"
+ ".inst 0x448347d2 // smlalt z18.s, p4/M, z30.h, z3.h\n"
+ "whilelt p3.h, x2, x0\n"
+ ".inst 0x44834395 // smlalb z21.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x4483478d // smlalt z13.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834325 // smlalb z5.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834727 // smlalt z7.s, p4/M, z25.h, z3.h\n"
+ ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n"
+ ".inst 0x4484438f // smlalb z15.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844792 // smlalt z18.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x44844355 // smlalb z21.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x4484474d // smlalt z13.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n"
+ ".inst 0x04aa76b5 // sqrdmulh z21.s, z21.s, z10.s\n"
+ ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
+ "and z28.d, z15.d, z29.d\n"
+ "and z26.d, z18.d, z19.d\n"
+ "and z16.d, z21.d, z29.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z28.s\n"
+ "sqadd z18.s, z18.s, z26.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "and z16.d, z13.d, z19.d\n"
+ ".inst 0x44844305 // smlalb z5.s, p4/M, z24.h, z4.h\n"
+ ".inst 0x44844707 // smlalt z7.s, p4/M, z24.h, z4.h\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z13.s, z13.s, z16.s\n"
+ ".inst 0x04b474e7 // sqrdmulh z7.s, z7.s, z20.s\n"
+ ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n"
+ "and z16.d, z5.d, z29.d\n"
+ ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n"
+ "and z25.d, z7.d, z19.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z26.d, z6.d, z29.d\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ "sqadd z5.s, z5.s, z16.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ "sqadd z7.s, z7.s, z25.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ "sqadd z6.s, z6.s, z26.s\n"
+ ".inst 0x448293af // srshl z15.s, p4/M, z15.s, z29.s\n"
+ ".inst 0x44829272 // srshl z18.s, p4/M, z18.s, z19.s\n"
+ "and z16.d, z8.d, z19.d\n"
+ ".inst 0x448293b5 // srshl z21.s, p4/M, z21.s, z29.s\n"
+ "add z15.s, z15.s, z17.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "add z21.s, z21.s, z17.s\n"
+ "smin z15.s, p4/M, z15.s, z11.s\n"
+ "sqadd z8.s, z8.s, z16.s\n"
+ "smin z18.s, p4/M, z18.s, z11.s\n"
+ "smin z21.s, p4/M, z21.s, z11.s\n"
+ "smax z15.s, p4/M, z15.s, z12.s\n"
+ ".inst 0x4482926d // srshl z13.s, p4/M, z13.s, z19.s\n"
+ "smax z18.s, p4/M, z18.s, z12.s\n"
+ "smax z21.s, p4/M, z21.s, z12.s\n"
+ ".inst 0x448293a5 // srshl z5.s, p4/M, z5.s, z29.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "trn1 z15.h, z15.h, z18.h\n"
+ "st1b { z15.h }, p0, [x7, x3]\n"
+ "add z5.s, z5.s, z17.s\n"
+ "smin z13.s, p4/M, z13.s, z11.s\n"
+ ".inst 0x44829267 // srshl z7.s, p4/M, z7.s, z19.s\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "smin z5.s, p4/M, z5.s, z11.s\n"
+ "smax z13.s, p4/M, z13.s, z12.s\n"
+ "add z7.s, z7.s, z17.s\n"
+ "add z6.s, z6.s, z17.s\n"
+ "smax z5.s, p4/M, z5.s, z12.s\n"
+ "trn1 z21.h, z21.h, z13.h\n"
+ "st1b { z21.h }, p0, [x8, x3]\n"
+ "smin z7.s, p4/M, z7.s, z11.s\n"
+ "smin z6.s, p4/M, z6.s, z11.s\n"
+ ".inst 0x44829268 // srshl z8.s, p4/M, z8.s, z19.s\n"
+ "smax z7.s, p4/M, z7.s, z12.s\n"
+ "smax z6.s, p4/M, z6.s, z12.s\n"
+ "add z8.s, z8.s, z17.s\n"
+ "trn1 z5.h, z5.h, z7.h\n"
+ "st1b { z5.h }, p0, [x17, x3]\n"
+ "smin z8.s, p4/M, z8.s, z11.s\n"
+ "smax z8.s, p4/M, z8.s, z12.s\n"
+ "trn1 z6.h, z6.h, z8.h\n"
+ "st1b { z6.h }, p0, [x16, x3]\n"
+ "inch x3\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1w { z4.s }, p2/Z, [x19]\n"
+ "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+ "uzp1 z15.s, z4.s, z16.s\n"
+ "addvl x19, x19, #2\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp2 z18.s, z4.s, z16.s\n"
+ "mov z21.d, z15.d\n"
+ "ld1sb { z0.h }, p4/Z, [x1]\n"
+ "mov z5.d, z15.d\n"
+ "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+ "mov z13.d, z18.d\n"
+ "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+ "mov z7.d, z18.d\n"
+ "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+ "mov z6.d, z15.d\n"
+ "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+ "mov z8.d, z18.d\n"
+ "ldp x28, x27, [x5, #0x0]\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ "ldp x26, x25, [x5, #0x10]\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ "ldp x24, x23, [x5, #0x20]\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ "ldp x22, x21, [x5, #0x30]\n"
+ "ldp x20, x19, [x5, #0x40]\n"
+ "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n"
+ "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n"
+ "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x45491bbd // usublb z29.h, z29.b, z9.b\n"
+ "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+ "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n"
+ "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n"
+ ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+ ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n"
+ ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n"
+ ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n"
+ ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)